#################### Exploratory Data Analysis ######################

## Data file: Station_GeoLocation_Longitute_Latitude_Elevation_EPSG_4326.csv

## Have a look at the data

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Station)
## [1] 8 4
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Station)
## Rows: 8
## Columns: 4
## $ longitude     <dbl> 73.0167, 80.2500, 77.2000, 80.9330, 72.8500, 77.5833, 85…
## $ Latitude      <dbl> 26.3000, 13.0667, 28.5833, 26.8667, 19.1167, 12.9667, 20…
## $ Elevation     <int> 217, 6, 211, 110, 8, 920, NA, NA
## $ Location_Name <chr> "Bangalore", "Chennai", "Delhi", "Lucknow", "Mumbai", "R…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Station)
## [1] "longitude"     "Latitude"      "Elevation"     "Location_Name"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Station)
## 'data.frame':    8 obs. of  4 variables:
##  $ longitude    : num  73 80.2 77.2 80.9 72.8 ...
##  $ Latitude     : num  26.3 13.1 28.6 26.9 19.1 ...
##  $ Elevation    : int  217 6 211 110 8 920 NA NA
##  $ Location_Name: chr  "Bangalore" "Chennai" "Delhi" "Lucknow" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Station)
##    longitude        Latitude       Elevation     Location_Name     
##  Min.   :72.85   Min.   :12.97   Min.   :  6.0   Length:8          
##  1st Qu.:76.15   1st Qu.:17.60   1st Qu.: 33.5   Class :character  
##  Median :78.92   Median :21.23   Median :160.5   Mode  :character  
##  Mean   :79.07   Mean   :21.17   Mean   :245.3                     
##  3rd Qu.:81.92   3rd Qu.:26.44   3rd Qu.:215.5                     
##  Max.   :85.83   Max.   :28.58   Max.   :920.0                     
##                                  NA's   :2
## Only Elevation seems to have some missing data, lets zoom into them
## FIX(review): removed attach(Weather_Station) -- attach() puts the columns on
## the global search path where they can silently mask other objects, which is
## a well-known R anti-pattern. Reference the column explicitly instead.
Weather_Station[is.na(Weather_Station$Elevation), ]
##   longitude Latitude Elevation Location_Name
## 7   85.8333  20.2500        NA   Bubhneshwar
## 8   84.8833  22.2167        NA      Rourkela
## Nothing special about why Bubhneshwar and Rourkela alone seems to have elevation missing
## No cleaning needed as there are no plans to make use of the elevation data of the stations

## To find outliers, draw a histogram

## Data file: Bangalore_1990_2022_BangaloreCity.csv

## Have a look at the data
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Bangalore)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Bangalore)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Bangalore)
##         time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4   NA
## 2 02-01-1990 21.7   NA 26.5    0
## 3 03-01-1990 21.0 16.4 26.5    0
## 4 04-01-1990 20.8   NA 27.4    0
## 5 05-01-1990 20.4 14.2 26.1    0
## 6 06-01-1990 20.4 17.1 24.2   NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Bangalore)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8  0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9  0.0
## 11892 23-07-2022 23.1 20.9 26.7  0.0
## 11893 24-07-2022 22.8 20.0 26.7  0.3
## 11894 25-07-2022 24.1 20.2 28.5  0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
## FIX(review): the names were stored but never displayed, although the print
## above announces them -- show them as promised.
all_columns <- names(Weather_Bangalore)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Bangalore)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
##  $ tmin: num  19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
##  $ tmax: num  28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
##  $ prcp: num  NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Bangalore)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :17.20   Min.   : 9.30   Min.   :19.80  
##  Class :character   1st Qu.:22.30   1st Qu.:18.10   1st Qu.:27.90  
##  Mode  :character   Median :23.50   Median :19.80   Median :29.50  
##                     Mean   :23.84   Mean   :19.39   Mean   :29.93  
##                     3rd Qu.:25.20   3rd Qu.:20.80   3rd Qu.:32.00  
##                     Max.   :32.40   Max.   :27.90   Max.   :39.20  
##                     NA's   :70      NA's   :1389    NA's   :629    
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  4.414  
##  3rd Qu.:  2.000  
##  Max.   :271.300  
##  NA's   :4620
## Count the total number of missing (NA) entries across the whole data frame
sum(is.na(Weather_Bangalore))
## [1] 6708
## Ok there are about 6708 NAs

## Data file: Chennai_1990_2022_Madras.csv

## Have a look at the data

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Chennai)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Chennai)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 25.2, 24.9, 25.6, 25.7, 25.5, 24.7, 25.4, 25.6, 24.8, 24.7, 24.5,…
## $ tmin <dbl> 22.8, 21.7, 21.4, NA, 20.7, NA, 23.3, 22.0, 21.7, 20.7, 20.0, 18.…
## $ tmax <dbl> 28.4, 29.1, 29.8, 28.7, 28.4, 26.1, 27.0, 28.0, 28.5, 29.0, 28.8,…
## $ prcp <dbl> 0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 18.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Chennai)
##         time tavg tmin tmax prcp
## 1 01-01-1990 25.2 22.8 28.4  0.5
## 2 02-01-1990 24.9 21.7 29.1  0.0
## 3 03-01-1990 25.6 21.4 29.8  0.0
## 4 04-01-1990 25.7   NA 28.7  0.0
## 5 05-01-1990 25.5 20.7 28.4  0.0
## 6 06-01-1990 24.7   NA 26.1  0.5
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Chennai)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 28.9 26.2 33.0  9.3
## 11890 21-07-2022 28.4 24.5 32.8 21.1
## 11891 22-07-2022 27.8 24.6 32.2 22.1
## 11892 23-07-2022 27.4 24.7 32.6 18.6
## 11893 24-07-2022 27.8 25.0 33.3  9.1
## 11894 25-07-2022 28.1 25.4 32.6  2.9
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
## FIX(review): the names were stored but never displayed, although the print
## above announces them -- show them as promised.
all_columns <- names(Weather_Chennai)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Chennai)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  25.2 24.9 25.6 25.7 25.5 24.7 25.4 25.6 24.8 24.7 ...
##  $ tmin: num  22.8 21.7 21.4 NA 20.7 NA 23.3 22 21.7 20.7 ...
##  $ tmax: num  28.4 29.1 29.8 28.7 28.4 26.1 27 28 28.5 29 ...
##  $ prcp: num  0.5 0 0 0 0 0.5 18 0.5 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Chennai)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :20.90   Min.   :12.00   Min.   :23.80  
##  Class :character   1st Qu.:26.30   1st Qu.:22.60   1st Qu.:31.10  
##  Mode  :character   Median :28.70   Median :24.60   Median :34.00  
##                     Mean   :28.49   Mean   :24.38   Mean   :33.91  
##                     3rd Qu.:30.40   3rd Qu.:26.40   3rd Qu.:36.20  
##                     Max.   :36.60   Max.   :31.00   Max.   :44.60  
##                     NA's   :27      NA's   :3084    NA's   :1019   
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  6.244  
##  3rd Qu.:  3.000  
##  Max.   :344.900  
##  NA's   :4886
## Count the total number of missing (NA) entries across the whole data frame
sum(is.na(Weather_Chennai))
## [1] 9016
## About 9016 entries are NA

## Data file: Delhi_NCR_1990_2022_Safdarjung.csv

## Have a look at the data

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Delhi)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Delhi)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 9.4, 9.3, 9.0, 10.7, 12.6, 14.9, 14.4, 10.7, 13.4, 16.6, 17.0, 17…
## $ tmin <dbl> 6.0, 5.2, 6.5, 6.0, 7.3, 8.1, 8.1, 8.5, 7.0, NA, 10.9, 9.8, 8.8, …
## $ tmax <dbl> 15.1, 14.2, 13.6, 17.5, 20.8, 22.9, 21.4, 16.6, 20.6, 22.8, 25.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Delhi)
##         time tavg tmin tmax prcp
## 1 01-01-1990  9.4  6.0 15.1    0
## 2 02-01-1990  9.3  5.2 14.2    0
## 3 03-01-1990  9.0  6.5 13.6    0
## 4 04-01-1990 10.7  6.0 17.5    0
## 5 05-01-1990 12.6  7.3 20.8    0
## 6 06-01-1990 14.9  8.1 22.9    0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Delhi)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 30.1 26.5 33.2 14.7
## 11890 21-07-2022 28.6 26.8 30.6 21.2
## 11891 22-07-2022 29.3 27.0 32.9  0.3
## 11892 23-07-2022 30.1 25.5 34.9  8.9
## 11893 24-07-2022 30.6 27.1 35.7  0.0
## 11894 25-07-2022 30.7 26.8 35.7  0.0
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
## FIX(review): the names were stored but never displayed, although the print
## above announces them -- show them as promised.
all_columns <- names(Weather_Delhi)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Delhi)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  9.4 9.3 9 10.7 12.6 14.9 14.4 10.7 13.4 16.6 ...
##  $ tmin: num  6 5.2 6.5 6 7.3 8.1 8.1 8.5 7 NA ...
##  $ tmax: num  15.1 14.2 13.6 17.5 20.8 22.9 21.4 16.6 20.6 22.8 ...
##  $ prcp: num  0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Delhi)
##      time                tavg           tmin            tmax      
##  Length:11894       Min.   : 6.6   Min.   : 0.10   Min.   : 9.80  
##  Class :character   1st Qu.:18.5   1st Qu.:11.80   1st Qu.:26.70  
##  Mode  :character   Median :27.0   Median :20.00   Median :33.20  
##                     Mean   :25.0   Mean   :18.88   Mean   :31.79  
##                     3rd Qu.:30.9   3rd Qu.:26.00   3rd Qu.:36.60  
##                     Max.   :39.8   Max.   :34.20   Max.   :48.10  
##                     NA's   :94     NA's   :1536    NA's   :533    
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  3.662  
##  3rd Qu.:  0.500  
##  Max.   :262.900  
##  NA's   :6140
## Count the total number of missing (NA) entries across the whole data frame
sum(is.na(Weather_Delhi))
## [1] 8303
## About 8303 entries are NA

## Data file: Lucknow_1990_2022.csv

## Have a look at the data

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Lucknow)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Lucknow)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 7.2, 10.5, 10.2, 9.1, 13.5, 11.5, 14.2, 17.1, 11.1, 14.8, 12.9, 1…
## $ tmin <dbl> NA, NA, 1.8, NA, NA, 5.9, 5.4, NA, NA, 4.1, 5.1, 7.3, NA, 6.9, 9.…
## $ tmax <dbl> 18.1, 17.2, 18.6, 19.3, 23.8, 21.4, 23.6, 24.6, 24.6, 23.6, 23.6,…
## $ prcp <dbl> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, …
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Lucknow)
##         time tavg tmin tmax prcp
## 1 01-01-1990  7.2   NA 18.1    0
## 2 02-01-1990 10.5   NA 17.2    0
## 3 03-01-1990 10.2  1.8 18.6   NA
## 4 04-01-1990  9.1   NA 19.3    0
## 5 05-01-1990 13.5   NA 23.8    0
## 6 06-01-1990 11.5  5.9 21.4    0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Lucknow)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 28.6 25.1 33.1 17.7
## 11890 21-07-2022 27.4 25.1 33.1 27.3
## 11891 22-07-2022 28.1 26.1 31.1 16.0
## 11892 23-07-2022 30.3 26.2 34.7 11.9
## 11893 24-07-2022 30.0 28.1 34.7  2.0
## 11894 25-07-2022 27.1 24.1 34.3  0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
## FIX(review): the names were stored but never displayed, although the print
## above announces them -- show them as promised.
all_columns <- names(Weather_Lucknow)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Lucknow)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  7.2 10.5 10.2 9.1 13.5 11.5 14.2 17.1 11.1 14.8 ...
##  $ tmin: num  NA NA 1.8 NA NA 5.9 5.4 NA NA 4.1 ...
##  $ tmax: num  18.1 17.2 18.6 19.3 23.8 21.4 23.6 24.6 24.6 23.6 ...
##  $ prcp: num  0 0 NA 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Lucknow)
##      time                tavg            tmin           tmax      
##  Length:11894       Min.   : 5.70   Min.   :-0.6   Min.   :11.10  
##  Class :character   1st Qu.:19.50   1st Qu.:12.5   1st Qu.:28.10  
##  Mode  :character   Median :27.20   Median :20.5   Median :33.40  
##                     Mean   :25.22   Mean   :18.8   Mean   :32.49  
##                     3rd Qu.:30.40   3rd Qu.:25.1   3rd Qu.:36.50  
##                     Max.   :39.70   Max.   :32.7   Max.   :47.30  
##                     NA's   :138     NA's   :3515   NA's   :1553   
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  4.536  
##  3rd Qu.:  1.000  
##  Max.   :470.900  
##  NA's   :6152
## Count the total number of missing (NA) entries across the whole data frame
sum(is.na(Weather_Lucknow))
## [1] 11358
## About 11358 entries are NA

## Data file: Mumbai_1990_2022_Santacruz.csv

## Have a look at the data

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Mumbai)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Mumbai)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 23.2, 22.2, 21.8, 25.4, 26.5, 25.1, 26.0, 26.6, 25.1, 26.8, 25.6,…
## $ tmin <dbl> 17.0, 16.5, 16.3, 17.9, 19.3, 19.8, 18.9, 18.8, 19.0, 19.3, 18.5,…
## $ tmax <dbl> NA, 29.9, 30.7, 31.8, 33.7, 33.5, 33.7, 34.6, 34.4, 34.7, 34.0, 3…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Mumbai)
##         time tavg tmin tmax prcp
## 1 01-01-1990 23.2 17.0   NA    0
## 2 02-01-1990 22.2 16.5 29.9    0
## 3 03-01-1990 21.8 16.3 30.7    0
## 4 04-01-1990 25.4 17.9 31.8    0
## 5 05-01-1990 26.5 19.3 33.7    0
## 6 06-01-1990 25.1 19.8 33.5    0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Mumbai)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 27.4 25.0 30.5 11.9
## 11890 21-07-2022 27.6 25.6 30.5 10.9
## 11891 22-07-2022 28.3 26.0 30.5  3.0
## 11892 23-07-2022 28.2 25.8 31.3  5.1
## 11893 24-07-2022 28.1 25.6 30.4  7.1
## 11894 25-07-2022 28.3 25.1 30.2  7.1
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
## FIX(review): the names were stored but never displayed, although the print
## above announces them -- show them as promised.
all_columns <- names(Weather_Mumbai)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Mumbai)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  23.2 22.2 21.8 25.4 26.5 25.1 26 26.6 25.1 26.8 ...
##  $ tmin: num  17 16.5 16.3 17.9 19.3 19.8 18.9 18.8 19 19.3 ...
##  $ tmax: num  NA 29.9 30.7 31.8 33.7 33.5 33.7 34.6 34.4 34.7 ...
##  $ prcp: num  0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Mumbai)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :17.70   Min.   : 8.50   Min.   :22.30  
##  Class :character   1st Qu.:26.60   1st Qu.:19.80   1st Qu.:30.90  
##  Mode  :character   Median :28.10   Median :23.70   Median :32.40  
##                     Mean   :27.76   Mean   :22.62   Mean   :32.31  
##                     3rd Qu.:29.30   3rd Qu.:25.40   3rd Qu.:33.90  
##                     Max.   :33.70   Max.   :30.40   Max.   :41.30  
##                     NA's   :11      NA's   :2454    NA's   :1907   
##       prcp       
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median :  0.00  
##  Mean   : 10.94  
##  3rd Qu.:  7.10  
##  Max.   :461.00  
##  NA's   :4681
## Count the total number of missing (NA) entries across the whole data frame
sum(is.na(Weather_Mumbai))
## [1] 9053
## About 9053 entries are NA

## Data file: Rajasthan_1990_2022_Jodhpur.csv

## Have a look at the data

## NOTE(review): every output in this section (dimensions, head, tail, summary,
## NA count) is byte-identical to the Weather_Bangalore section above -- verify
## that the Jodhpur CSV was actually the file that was read, and not a copy of
## the Bangalore data.

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Jodhpur)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Jodhpur)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Jodhpur)
##         time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4   NA
## 2 02-01-1990 21.7   NA 26.5    0
## 3 03-01-1990 21.0 16.4 26.5    0
## 4 04-01-1990 20.8   NA 27.4    0
## 5 05-01-1990 20.4 14.2 26.1    0
## 6 06-01-1990 20.4 17.1 24.2   NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Jodhpur)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8  0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9  0.0
## 11892 23-07-2022 23.1 20.9 26.7  0.0
## 11893 24-07-2022 22.8 20.0 26.7  0.3
## 11894 25-07-2022 24.1 20.2 28.5  0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
## FIX(review): the names were stored but never displayed, although the print
## above announces them -- show them as promised.
all_columns <- names(Weather_Jodhpur)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Jodhpur)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
##  $ tmin: num  19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
##  $ tmax: num  28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
##  $ prcp: num  NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Jodhpur)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :17.20   Min.   : 9.30   Min.   :19.80  
##  Class :character   1st Qu.:22.30   1st Qu.:18.10   1st Qu.:27.90  
##  Mode  :character   Median :23.50   Median :19.80   Median :29.50  
##                     Mean   :23.84   Mean   :19.39   Mean   :29.93  
##                     3rd Qu.:25.20   3rd Qu.:20.80   3rd Qu.:32.00  
##                     Max.   :32.40   Max.   :27.90   Max.   :39.20  
##                     NA's   :70      NA's   :1389    NA's   :629    
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  4.414  
##  3rd Qu.:  2.000  
##  Max.   :271.300  
##  NA's   :4620
## Count the total number of missing (NA) entries across the whole data frame
sum(is.na(Weather_Jodhpur))
## [1] 6708
## About 6708 entries are NA

## Data file: weather_Bhubhneshwar_1990_2022.csv

## Have a look at the data

#definitely has more columns than the cities that we have seen so far

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Bhubhneshwar)
## [1] 11935    11
#OK, so we have 11 columns, 6 more than others

print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Bhubhneshwar)
## Rows: 11,935
## Columns: 11
## $ time <chr> "1990-01-01", "1990-01-02", "1990-01-03", "1990-01-04", "1990-01-…
## $ tavg <dbl> 20.1, 20.7, 20.7, 18.8, 19.8, 22.2, 20.8, 20.3, 22.3, 21.6, 21.7,…
## $ tmin <dbl> NA, 16.4, 16.0, NA, 11.0, 12.5, NA, 13.6, 14.8, 14.5, 15.6, 12.8,…
## $ tmax <dbl> 28.0, NA, 27.4, 28.0, 28.2, NA, NA, 29.5, 31.6, 30.8, 30.7, 29.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, 0, …
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wspd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Bhubhneshwar)
##         time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 1 1990-01-01 20.1   NA 28.0    0   NA   NA   NA   NA   NA   NA
## 2 1990-01-02 20.7 16.4   NA    0   NA   NA   NA   NA   NA   NA
## 3 1990-01-03 20.7 16.0 27.4    0   NA   NA   NA   NA   NA   NA
## 4 1990-01-04 18.8   NA 28.0    0   NA   NA   NA   NA   NA   NA
## 5 1990-01-05 19.8 11.0 28.2    0   NA   NA   NA   NA   NA   NA
## 6 1990-01-06 22.2 12.5   NA    0   NA   NA   NA   NA   NA   NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Bhubhneshwar)
##             time tavg tmin tmax prcp snow wdir wspd wpgt   pres tsun
## 11930 2022-08-30 30.0 27.0 34.0  1.2   NA  169  8.3   NA 1007.6   NA
## 11931 2022-08-31 29.2 26.3 33.0  9.0   NA  186  8.2   NA 1006.6   NA
## 11932 2022-09-01 29.6 27.0 33.0  2.1   NA  190  9.5   NA 1006.8   NA
## 11933 2022-09-02 29.7 26.3 33.0  3.3   NA  198  9.5   NA 1007.3   NA
## 11934 2022-09-03 29.2 26.1 34.0  9.7   NA  215  8.5   NA 1005.5   NA
## 11935 2022-09-04 27.6 25.9 31.6 12.8   NA  214  8.6   NA 1004.9   NA
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Bhubhneshwar)
##  [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## FIX(review): the earlier note misread these columns ("tsunami" is wrong).
## Per the usual Meteostat daily-data naming, the additional columns are:
## snow (snow depth), wdir (wind direction), wspd (wind speed),
## wpgt (peak wind gust), pres (sea-level air pressure) and
## tsun (sunshine duration) -- verify against the data source.

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Bhubhneshwar)
## 'data.frame':    11935 obs. of  11 variables:
##  $ time: chr  "1990-01-01" "1990-01-02" "1990-01-03" "1990-01-04" ...
##  $ tavg: num  20.1 20.7 20.7 18.8 19.8 22.2 20.8 20.3 22.3 21.6 ...
##  $ tmin: num  NA 16.4 16 NA 11 12.5 NA 13.6 14.8 14.5 ...
##  $ tmax: num  28 NA 27.4 28 28.2 NA NA 29.5 31.6 30.8 ...
##  $ prcp: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ snow: logi  NA NA NA NA NA NA ...
##  $ wdir: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ wspd: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ wpgt: logi  NA NA NA NA NA NA ...
##  $ pres: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tsun: logi  NA NA NA NA NA NA ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Bhubhneshwar)
##      time                tavg            tmin            tmax     
##  Length:11935       Min.   :15.70   Min.   : 8.20   Min.   :19.4  
##  Class :character   1st Qu.:24.70   1st Qu.:19.00   1st Qu.:30.4  
##  Mode  :character   Median :27.70   Median :24.00   Median :32.8  
##                     Mean   :26.99   Mean   :22.24   Mean   :33.0  
##                     3rd Qu.:29.40   3rd Qu.:25.60   3rd Qu.:35.4  
##                     Max.   :37.40   Max.   :31.80   Max.   :46.7  
##                     NA's   :78      NA's   :2090    NA's   :891   
##       prcp           snow              wdir            wspd       
##  Min.   :  0.000   Mode:logical   Min.   :  0.0   Min.   : 0.500  
##  1st Qu.:  0.000   NA's:11935     1st Qu.: 89.0   1st Qu.: 4.500  
##  Median :  0.000                  Median :188.0   Median : 7.000  
##  Mean   :  7.074                  Mean   :169.1   Mean   : 8.399  
##  3rd Qu.:  4.100                  3rd Qu.:220.8   3rd Qu.:11.000  
##  Max.   :470.900                  Max.   :359.0   Max.   :33.100  
##  NA's   :5097                     NA's   :10641   NA's   :9806    
##    wpgt              pres          tsun        
##  Mode:logical   Min.   : 990.6   Mode:logical  
##  NA's:11935     1st Qu.:1002.9   NA's:11935    
##                 Median :1007.3                 
##                 Mean   :1007.4                 
##                 3rd Qu.:1012.4                 
##                 Max.   :1019.3                 
##                 NA's   :10692
## Count the total number of missing (NA) entries across the whole data frame
sum(is.na(Weather_Bhubhneshwar))
## [1] 75100
## About 75100 entries are NA

## Data file: weather_Rourkela_2021_2022.csv

## Have a look at the data
#definitely has more columns than the cities that we have seen so far

## Shape check: 426 daily rows x 11 columns for Rourkela.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Rourkela)
## [1] 426  11
#OK, so we have 11 columns, 6 more than others

## Transposed preview of columns and their types (glimpse from dplyr/tibble).
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Rourkela)
## Rows: 426
## Columns: 11
## $ time <chr> "2021-07-06", "2021-07-07", "2021-07-08", "2021-07-09", "2021-07-…
## $ tavg <dbl> 29.3, 29.7, 27.4, 28.5, 29.0, 29.3, 28.9, 28.6, 29.0, 29.5, 29.6,…
## $ tmin <dbl> 26.2, 27.3, 25.8, 26.1, 26.2, 26.2, 25.7, 25.5, 25.4, 25.5, 26.3,…
## $ tmax <dbl> 32.6, 33.4, 29.7, 32.1, 32.6, 33.7, 32.9, 32.5, 32.7, 33.4, 33.2,…
## $ prcp <dbl> NA, 11.1, 66.9, 11.4, 2.7, 10.8, 5.4, 10.1, 1.9, 1.3, 1.1, 6.0, 8…
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> 197, 199, 186, 173, 121, 70, 95, 101, 138, 152, 179, 181, 181, 19…
## $ wspd <dbl> 6.8, 6.9, 6.3, 3.9, 4.6, 5.8, 7.0, 5.5, 6.5, 8.7, 9.5, 8.3, 8.0, …
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> 1002.5, 1002.2, 1001.8, 1001.0, 1000.9, 1002.2, 1003.4, 1002.8, 1…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## First six rows — data starts on 2021-07-06.
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Rourkela)
##         time tavg tmin tmax prcp snow wdir wspd wpgt   pres tsun
## 1 2021-07-06 29.3 26.2 32.6   NA   NA  197  6.8   NA 1002.5   NA
## 2 2021-07-07 29.7 27.3 33.4 11.1   NA  199  6.9   NA 1002.2   NA
## 3 2021-07-08 27.4 25.8 29.7 66.9   NA  186  6.3   NA 1001.8   NA
## 4 2021-07-09 28.5 26.1 32.1 11.4   NA  173  3.9   NA 1001.0   NA
## 5 2021-07-10 29.0 26.2 32.6  2.7   NA  121  4.6   NA 1000.9   NA
## 6 2021-07-11 29.3 26.2 33.7 10.8   NA   70  5.8   NA 1002.2   NA
## Last six rows — data ends on 2022-09-04.
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Rourkela)
##           time tavg tmin tmax prcp snow wdir wspd wpgt   pres tsun
## 421 2022-08-30 29.8 26.4 34.3  0.0   NA  174  7.6   NA 1007.9   NA
## 422 2022-08-31 29.0 26.6 33.5  2.0   NA  187  8.6   NA 1006.8   NA
## 423 2022-09-01 29.1 25.7 33.2 11.5   NA  205  6.7   NA 1007.2   NA
## 424 2022-09-02 29.4 26.4 33.7  1.5   NA  189  7.0   NA 1007.5   NA
## 425 2022-09-03 28.7 26.6 32.6  8.0   NA  203  8.0   NA 1005.8   NA
## 426 2022-09-04 28.2 25.9 31.8 17.7   NA  211  6.8   NA 1004.8   NA
## Column names of the Rourkela weather data.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Rourkela)
##  [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## So the additional columns are: snow (snow depth), wdir (wind direction), wspd (wind speed), wpgt (peak wind gust), pres (air pressure) and tsun (total sunshine duration, per the Meteostat daily schema — not "tsunami")

## Compact structure dump: types match Bhubaneswar's (time is chr, not Date).
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Rourkela)
## 'data.frame':    426 obs. of  11 variables:
##  $ time: chr  "2021-07-06" "2021-07-07" "2021-07-08" "2021-07-09" ...
##  $ tavg: num  29.3 29.7 27.4 28.5 29 29.3 28.9 28.6 29 29.5 ...
##  $ tmin: num  26.2 27.3 25.8 26.1 26.2 26.2 25.7 25.5 25.4 25.5 ...
##  $ tmax: num  32.6 33.4 29.7 32.1 32.6 33.7 32.9 32.5 32.7 33.4 ...
##  $ prcp: num  NA 11.1 66.9 11.4 2.7 10.8 5.4 10.1 1.9 1.3 ...
##  $ snow: logi  NA NA NA NA NA NA ...
##  $ wdir: num  197 199 186 173 121 70 95 101 138 152 ...
##  $ wspd: num  6.8 6.9 6.3 3.9 4.6 5.8 7 5.5 6.5 8.7 ...
##  $ wpgt: logi  NA NA NA NA NA NA ...
##  $ pres: num  1002 1002 1002 1001 1001 ...
##  $ tsun: logi  NA NA NA NA NA NA ...
## Per-column summary; snow/wpgt/tsun are all-NA here as well (426 NAs each).
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Rourkela)
##      time                tavg            tmin            tmax      
##  Length:426         Min.   :14.60   Min.   : 8.20   Min.   :21.50  
##  Class :character   1st Qu.:24.40   1st Qu.:18.18   1st Qu.:29.60  
##  Mode  :character   Median :28.10   Median :25.20   Median :32.10  
##                     Mean   :26.71   Mean   :22.30   Mean   :32.25  
##                     3rd Qu.:29.30   3rd Qu.:26.10   3rd Qu.:33.80  
##                     Max.   :35.00   Max.   :29.30   Max.   :43.60  
##                     NA's   :2       NA's   :2       NA's   :2      
##       prcp           snow              wdir            wspd       
##  Min.   :  0.000   Mode:logical   Min.   :  0.0   Min.   : 2.900  
##  1st Qu.:  0.000   NA's:426       1st Qu.: 49.0   1st Qu.: 5.500  
##  Median :  0.200                  Median :168.0   Median : 6.600  
##  Mean   :  5.695                  Mean   :140.3   Mean   : 7.441  
##  3rd Qu.:  7.200                  3rd Qu.:195.2   3rd Qu.: 8.725  
##  Max.   :123.000                  Max.   :359.0   Max.   :20.400  
##  NA's   :3                        NA's   :2       NA's   :2       
##    wpgt              pres          tsun        
##  Mode:logical   Min.   : 993.1   Mode:logical  
##  NA's:426       1st Qu.:1002.5   NA's:426      
##                 Median :1005.5                 
##                 Mean   :1006.8                 
##                 3rd Qu.:1012.1                 
##                 Max.   :1020.6                 
##                 NA's   :2
## Total NA cells across the whole Rourkela data frame.
sum(is.na(Weather_Rourkela))
## [1] 1293
## About 1293 entries are NA

## AQI stations metadata: stations.csv
## FIX: this line was a bare header and the load statement was lost;
## reconstructed from the read.csv() pattern used for the other datasets
## (AQ_stations is used immediately below) — verify the path.
AQ_stations <- read.csv("./datasets/stations.csv")

## Have a look at the data

## 230 monitoring stations x 5 metadata columns. Note in the glimpse that
## Status is frequently the empty string "" rather than NA.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_stations)
## [1] 230   5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_stations)
## Rows: 230
## Columns: 5
## $ StationId   <chr> "AP001", "AP002", "AP003", "AP004", "AP005", "AS001", "BR0…
## $ StationName <chr> "Secretariat, Amaravati - APPCB", "Anand Kala Kshetram, Ra…
## $ City        <chr> "Amaravati", "Rajamahendravaram", "Tirupati", "Vijayawada"…
## $ State       <chr> "Andhra Pradesh", "Andhra Pradesh", "Andhra Pradesh", "And…
## $ Status      <chr> "Active", "", "", "", "Active", "Active", "", "", "", "", …
## Column names and structure: all five columns are character vectors.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_stations)
## [1] "StationId"   "StationName" "City"        "State"       "Status"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_stations)
## 'data.frame':    230 obs. of  5 variables:
##  $ StationId  : chr  "AP001" "AP002" "AP003" "AP004" ...
##  $ StationName: chr  "Secretariat, Amaravati - APPCB" "Anand Kala Kshetram, Rajamahendravaram - APPCB" "Tirumala, Tirupati - APPCB" "PWD Grounds, Vijayawada - APPCB" ...
##  $ City       : chr  "Amaravati" "Rajamahendravaram" "Tirupati" "Vijayawada" ...
##  $ State      : chr  "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
##  $ Status     : chr  "Active" "" "" "" ...
## summary() on all-character columns only reports lengths — blanks in
## Status are invisible here because "" is not NA.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_stations)
##   StationId         StationName            City              State          
##  Length:230         Length:230         Length:230         Length:230        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##     Status         
##  Length:230        
##  Class :character  
##  Mode  :character
## NOTE(review): attach() is discouraged — it places a *copy* of the columns
## on the search path, so later edits to AQ_stations (e.g. the "" -> NA
## recode below) are NOT reflected in the bare column names. Prefer
## AQ_stations$col or with(AQ_stations, ...).
attach(AQ_stations)

## There are no literal NA records, but many Status entries are the empty
## string. Recode "" to NA across the whole data frame, then list the
## affected rows.
AQ_stations[AQ_stations == ""] <- NA
## BUG FIX: the original subset used the bare `Status` column exposed by
## attach(AQ_stations), which is a copy taken *before* the "" -> NA recode,
## so it reported "<0 rows>" even though blank statuses are visible in the
## glimpse above. Index the data frame's own (updated) column instead.
AQ_stations[is.na(AQ_stations$Status), ]

## AQI station-level hourly readings: station_hour.csv
## FIX: this line was a bare header and the load statement was lost;
## reconstructed from the read.csv() pattern used for the other datasets
## (AQ_station_hour is used immediately below) — verify the path.
AQ_station_hour <- read.csv("./datasets/station_hour.csv")

## Have a look at the data

## ~2.59 million hourly rows x 16 columns; AQI_Bucket again uses "" for missing.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_station_hour)
## [1] 2589083      16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_hour)
## Rows: 2,589,083
## Columns: 16
## $ StationId  <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Datetime   <chr> "2017-11-24 17:00:00", "2017-11-24 18:00:00", "2017-11-24 1…
## $ PM2.5      <dbl> 60.50, 65.50, 80.00, 81.50, 75.25, 69.25, 67.50, 68.00, 73.…
## $ PM10       <dbl> 98.00, 111.25, 132.00, 133.25, 116.00, 108.25, 111.50, 111.…
## $ NO         <dbl> 2.35, 2.70, 2.10, 1.95, 1.43, 0.70, 1.05, 1.25, 0.30, 0.80,…
## $ NO2        <dbl> 30.80, 24.20, 25.18, 16.25, 17.48, 18.47, 12.15, 14.12, 14.…
## $ NOx        <dbl> 18.25, 15.07, 15.15, 10.23, 10.43, 10.38, 7.30, 8.50, 7.90,…
## $ NH3        <dbl> 8.50, 9.77, 12.02, 11.58, 12.03, 13.80, 17.65, 20.28, 11.50…
## $ CO         <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.1, 0.1,…
## $ SO2        <dbl> 11.85, 13.17, 12.08, 10.47, 9.12, 9.25, 9.40, 8.90, 11.80, …
## $ O3         <dbl> 126.40, 117.12, 98.98, 112.20, 106.35, 91.10, 112.70, 116.1…
## $ Benzene    <dbl> 0.10, 0.10, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.23,…
## $ Toluene    <dbl> 6.10, 6.25, 5.98, 6.72, 5.75, 5.02, 5.60, 5.55, 6.60, 6.77,…
## $ Xylene     <dbl> 0.10, 0.15, 0.18, 0.10, 0.08, 0.00, 0.10, 0.05, 0.00, 0.10,…
## $ AQI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
## Column names: 12 pollutant measures plus StationId, Datetime, AQI, AQI_Bucket.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_station_hour)
##  [1] "StationId"  "Datetime"   "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
## Structure dump: Datetime is still chr (not POSIXct) at this stage.
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_hour)
## 'data.frame':    2589083 obs. of  16 variables:
##  $ StationId : chr  "AP001" "AP001" "AP001" "AP001" ...
##  $ Datetime  : chr  "2017-11-24 17:00:00" "2017-11-24 18:00:00" "2017-11-24 19:00:00" "2017-11-24 20:00:00" ...
##  $ PM2.5     : num  60.5 65.5 80 81.5 75.2 ...
##  $ PM10      : num  98 111 132 133 116 ...
##  $ NO        : num  2.35 2.7 2.1 1.95 1.43 0.7 1.05 1.25 0.3 0.8 ...
##  $ NO2       : num  30.8 24.2 25.2 16.2 17.5 ...
##  $ NOx       : num  18.2 15.1 15.2 10.2 10.4 ...
##  $ NH3       : num  8.5 9.77 12.02 11.58 12.03 ...
##  $ CO        : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.3 0.1 ...
##  $ SO2       : num  11.85 13.17 12.08 10.47 9.12 ...
##  $ O3        : num  126 117 99 112 106 ...
##  $ Benzene   : num  0.1 0.1 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.23 ...
##  $ Toluene   : num  6.1 6.25 5.98 6.72 5.75 5.02 5.6 5.55 6.6 6.77 ...
##  $ Xylene    : num  0.1 0.15 0.18 0.1 0.08 0 0.1 0.05 0 0.1 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
## Per-column summary; the NA's rows here are the source for the missing-data
## tally noted after the attach() below.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_station_hour)
##   StationId           Datetime             PM2.5             PM10        
##  Length:2589083     Length:2589083     Min.   :   0.0   Min.   :   0.0   
##  Class :character   Class :character   1st Qu.:  28.2   1st Qu.:  64.0   
##  Mode  :character   Mode  :character   Median :  52.6   Median : 116.2   
##                                        Mean   :  80.9   Mean   : 158.5   
##                                        3rd Qu.:  97.7   3rd Qu.: 204.0   
##                                        Max.   :1000.0   Max.   :1000.0   
##                                        NA's   :647689   NA's   :1119252  
##        NO              NO2              NOx              NH3         
##  Min.   :  0.0    Min.   :  0.0    Min.   :  0.0    Min.   :  0.0    
##  1st Qu.:  3.0    1st Qu.: 13.1    1st Qu.: 11.3    1st Qu.: 11.2    
##  Median :  7.2    Median : 24.8    Median : 22.9    Median : 22.4    
##  Mean   : 22.8    Mean   : 35.2    Mean   : 40.6    Mean   : 28.7    
##  3rd Qu.: 18.6    3rd Qu.: 45.5    3rd Qu.: 45.7    3rd Qu.: 37.8    
##  Max.   :500.0    Max.   :500.0    Max.   :500.0    Max.   :500.0    
##  NA's   :553711   NA's   :528973   NA's   :490808   NA's   :1236618  
##        CO              SO2               O3            Benzene      
##  Min.   :  0.0    Min.   :  0.0    Min.   :  0.0    Min.   :  0.0   
##  1st Qu.:  0.4    1st Qu.:  4.2    1st Qu.: 11.0    1st Qu.:  0.1   
##  Median :  0.8    Median :  8.2    Median : 24.8    Median :  1.0   
##  Mean   :  1.5    Mean   : 12.1    Mean   : 38.1    Mean   :  3.3   
##  3rd Qu.:  1.4    3rd Qu.: 14.5    3rd Qu.: 49.5    3rd Qu.:  3.2   
##  Max.   :498.6    Max.   :200.0    Max.   :997.0    Max.   :498.1   
##  NA's   :499302   NA's   :742737   NA's   :725973   NA's   :861579  
##     Toluene            Xylene             AQI          AQI_Bucket       
##  Min.   :  0.0     Min.   :  0.0     Min.   :   5.0   Length:2589083    
##  1st Qu.:  0.3     1st Qu.:  0.0     1st Qu.:  84.0   Class :character  
##  Median :  3.4     Median :  0.2     Median : 131.0   Mode  :character  
##  Mean   : 14.9     Mean   :  2.4     Mean   : 180.2                     
##  3rd Qu.: 15.1     3rd Qu.:  1.8     3rd Qu.: 259.0                     
##  Max.   :500.0     Max.   :500.0     Max.   :3133.0                     
##  NA's   :1042366   NA's   :2075104   NA's   :570190
## NOTE(review): attach() exposes stale copies — the "" -> NA recode below
## will not update the attached bare column names. Prefer explicit
## AQ_station_hour$col access.
attach(AQ_station_hour)
## The following object is masked from AQ_stations:
## 
##     StationId
## Recode empty strings (seen in AQI_Bucket) to NA across the data frame.
AQ_station_hour [AQ_station_hour == ""] <- NA

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5:647689  PM10:1119252  NO:553711       NO2:528973       NOx:490808   NH3:1236618
## CO:499302     SO2:742737    O3:725973       Benzene:861579   Toluene:1042366  Xylene:2075104

## Rows per AQI bucket (grouped tibble shown below). NOTE(review): the
## idiomatic dplyr shorthand is count(AQI_Bucket), though it returns an
## ungrouped tibble rather than the grouped one printed here.
AQ_station_hour %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket        n
##   <chr>         <int>
## 1 Good         152113
## 2 Moderate     675008
## 3 Poor         239990
## 4 Satisfactory 530164
## 5 Severe       120468
## 6 Very Poor    301150
## 7 <NA>         570190
## Looks like Moderate entries are the highest ones but second highest is NA entries...

## AQI station-level daily readings: station_day.csv
## FIX: this line was a bare header and the load statement was lost;
## reconstructed from the read.csv() pattern used for the other datasets
## (AQ_station_day is used immediately below) — verify the path.
AQ_station_day <- read.csv("./datasets/station_day.csv")

## Have a look at the data

## 108,035 daily rows x 16 columns — same schema as the hourly table but
## keyed by Date instead of Datetime.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_station_day)
## [1] 108035     16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_day)
## Rows: 108,035
## Columns: 16
## $ StationId  <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Date       <chr> "2017-11-24", "2017-11-25", "2017-11-26", "2017-11-27", "20…
## $ PM2.5      <dbl> 71.36, 81.40, 78.32, 88.76, 64.18, 72.47, 69.80, 73.96, 89.…
## $ PM10       <dbl> 115.75, 124.50, 129.06, 135.32, 104.09, 114.84, 114.86, 113…
## $ NO         <dbl> 1.75, 1.44, 1.26, 6.60, 2.56, 5.23, 4.69, 4.58, 7.71, 0.97,…
## $ NO2        <dbl> 20.65, 20.50, 26.00, 30.85, 28.07, 23.20, 20.17, 19.29, 26.…
## $ NOx        <dbl> 12.40, 12.08, 14.85, 21.77, 17.01, 16.59, 14.54, 13.97, 19.…
## $ NH3        <dbl> 12.19, 10.72, 10.28, 12.91, 11.42, 12.25, 10.95, 10.95, 13.…
## $ CO         <dbl> 0.10, 0.12, 0.14, 0.11, 0.09, 0.16, 0.12, 0.10, 0.10, 0.15,…
## $ SO2        <dbl> 10.76, 15.24, 26.96, 33.59, 19.00, 10.55, 14.07, 13.90, 19.…
## $ O3         <dbl> 109.26, 127.09, 117.44, 111.81, 138.18, 109.74, 118.09, 123…
## $ Benzene    <dbl> 0.17, 0.20, 0.22, 0.29, 0.17, 0.21, 0.16, 0.17, 0.25, 0.23,…
## $ Toluene    <dbl> 5.92, 6.50, 7.95, 7.63, 5.02, 4.71, 3.52, 2.85, 2.79, 3.82,…
## $ Xylene     <dbl> 0.10, 0.06, 0.08, 0.12, 0.07, 0.08, 0.06, 0.04, 0.07, 0.04,…
## $ AQI        <dbl> NA, 184, 197, 198, 188, 173, 165, 191, 191, 227, 168, 198, …
## $ AQI_Bucket <chr> "", "Moderate", "Moderate", "Moderate", "Moderate", "Modera…
## Column names: identical to station_hour except Datetime -> Date.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_station_day)
##  [1] "StationId"  "Date"       "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
## Structure dump: Date is still chr (not Date class) at this stage.
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_day)
## 'data.frame':    108035 obs. of  16 variables:
##  $ StationId : chr  "AP001" "AP001" "AP001" "AP001" ...
##  $ Date      : chr  "2017-11-24" "2017-11-25" "2017-11-26" "2017-11-27" ...
##  $ PM2.5     : num  71.4 81.4 78.3 88.8 64.2 ...
##  $ PM10      : num  116 124 129 135 104 ...
##  $ NO        : num  1.75 1.44 1.26 6.6 2.56 5.23 4.69 4.58 7.71 0.97 ...
##  $ NO2       : num  20.6 20.5 26 30.9 28.1 ...
##  $ NOx       : num  12.4 12.1 14.8 21.8 17 ...
##  $ NH3       : num  12.2 10.7 10.3 12.9 11.4 ...
##  $ CO        : num  0.1 0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 ...
##  $ SO2       : num  10.8 15.2 27 33.6 19 ...
##  $ O3        : num  109 127 117 112 138 ...
##  $ Benzene   : num  0.17 0.2 0.22 0.29 0.17 0.21 0.16 0.17 0.25 0.23 ...
##  $ Toluene   : num  5.92 6.5 7.95 7.63 5.02 4.71 3.52 2.85 2.79 3.82 ...
##  $ Xylene    : num  0.1 0.06 0.08 0.12 0.07 0.08 0.06 0.04 0.07 0.04 ...
##  $ AQI       : num  NA 184 197 198 188 173 165 191 191 227 ...
##  $ AQI_Bucket: chr  "" "Moderate" "Moderate" "Moderate" ...
## Per-column summary; NA's rows feed the missing-data tally noted below.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_station_day)
##   StationId             Date               PM2.5              PM10        
##  Length:108035      Length:108035      Min.   :   0.02   Min.   :   0.01  
##  Class :character   Class :character   1st Qu.:  31.88   1st Qu.:  70.15  
##  Mode  :character   Mode  :character   Median :  55.95   Median : 122.09  
##                                        Mean   :  80.27   Mean   : 157.97  
##                                        3rd Qu.:  99.92   3rd Qu.: 208.67  
##                                        Max.   :1000.00   Max.   :1000.00  
##                                        NA's   :21625     NA's   :42706    
##        NO              NO2              NOx              NH3        
##  Min.   :  0.01   Min.   :  0.01   Min.   :  0.00   Min.   :  0.01  
##  1st Qu.:  4.84   1st Qu.: 15.09   1st Qu.: 13.97   1st Qu.: 11.90  
##  Median : 10.29   Median : 27.21   Median : 26.66   Median : 23.59  
##  Mean   : 23.12   Mean   : 35.24   Mean   : 41.20   Mean   : 28.73  
##  3rd Qu.: 24.98   3rd Qu.: 46.93   3rd Qu.: 50.50   3rd Qu.: 38.14  
##  Max.   :470.00   Max.   :448.05   Max.   :467.63   Max.   :418.90  
##  NA's   :17106    NA's   :16547    NA's   :15500    NA's   :48105   
##        CO               SO2               O3            Benzene       
##  Min.   :  0.000   Min.   :  0.01   Min.   :  0.01   Min.   :  0.000  
##  1st Qu.:  0.530   1st Qu.:  5.04   1st Qu.: 18.89   1st Qu.:  0.160  
##  Median :  0.910   Median :  8.95   Median : 30.84   Median :  1.210  
##  Mean   :  1.606   Mean   : 12.26   Mean   : 38.13   Mean   :  3.358  
##  3rd Qu.:  1.450   3rd Qu.: 14.92   3rd Qu.: 47.14   3rd Qu.:  3.610  
##  Max.   :175.810   Max.   :195.65   Max.   :963.00   Max.   :455.030  
##  NA's   :12998     NA's   :25204    NA's   :25568    NA's   :31455    
##     Toluene           Xylene            AQI          AQI_Bucket       
##  Min.   :  0.00   Min.   :  0.00   Min.   :   8.0   Length:108035     
##  1st Qu.:  0.69   1st Qu.:  0.00   1st Qu.:  86.0   Class :character  
##  Median :  4.33   Median :  0.40   Median : 132.0   Mode  :character  
##  Mean   : 15.35   Mean   :  2.42   Mean   : 179.7                     
##  3rd Qu.: 17.51   3rd Qu.:  2.11   3rd Qu.: 254.0                     
##  Max.   :454.85   Max.   :170.37   Max.   :2049.0                     
##  NA's   :38702    NA's   :85137    NA's   :21010
## NOTE(review): another attach() — the masking messages below show the
## search path piling up copies; the "" -> NA recode does not update them.
attach(AQ_station_day)
## The following objects are masked from AQ_station_hour:
## 
##     AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
##     SO2, StationId, Toluene, Xylene
## The following object is masked from AQ_stations:
## 
##     StationId
## Recode empty-string cells (AQI_Bucket) to NA across the data frame.
AQ_station_day [AQ_station_day == ""] <- NA

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 21625 PM10: 42706 NO: 17106 NO2: 16547 NOx: 15500 NH3: 48105 
## CO:  12998  SO2: 25204  O3: 25568 Benzene: 31455 Toluene: 38702 Xylene: 85137
## Rows per AQI bucket for the daily station data (grouped tibble below).
AQ_station_day %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket       n
##   <chr>        <int>
## 1 Good          5510
## 2 Moderate     29417
## 3 Poor         11493
## 4 Satisfactory 23636
## 5 Severe        5207
## 6 Very Poor    11762
## 7 <NA>         21010
## Looks like Moderate entries are the highest ones, followed by Satisfactory 
## but third highest is NA entries...

## Load the city-level daily AQI data.
## FIX: the original used typographic quotes (“ ”), which are a syntax
## error in R; replaced with ASCII double quotes.
AQ_city_day <- read.csv("./datasets/city_day.csv")

## Have a look at the data

## 29,531 city-day rows x 16 columns; same pollutant schema keyed by City + Date.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_day)
## [1] 29531    16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_day)
## Rows: 29,531
## Columns: 16
## $ City       <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Date       <chr> "2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04", "20…
## $ PM2.5      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO         <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ NO2        <dbl> 18.22, 15.69, 19.30, 18.48, 21.42, 38.48, 40.62, 36.74, 31.…
## $ NOx        <dbl> 17.15, 16.46, 29.70, 17.97, 37.76, 81.50, 130.77, 96.75, 48…
## $ NH3        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO         <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ SO2        <dbl> 27.64, 24.55, 29.07, 18.59, 39.33, 45.76, 32.28, 38.54, 58.…
## $ O3         <dbl> 133.36, 34.06, 30.70, 36.08, 39.31, 46.51, 33.47, 31.89, 25…
## $ Benzene    <dbl> 0.00, 3.68, 6.80, 4.43, 7.01, 5.42, 0.00, 0.00, 0.00, 0.00,…
## $ Toluene    <dbl> 0.02, 5.50, 16.40, 10.14, 18.89, 10.83, 0.00, 0.00, 0.00, 0…
## $ Xylene     <dbl> 0.00, 3.77, 2.25, 1.00, 2.78, 1.93, 0.00, 0.00, 0.00, 0.00,…
## $ AQI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
## Column names: City replaces StationId relative to the station tables.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_day)
##  [1] "City"       "Date"       "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
## Structure dump: Date is chr here as well.
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_day)
## 'data.frame':    29531 obs. of  16 variables:
##  $ City      : chr  "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
##  $ Date      : chr  "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
##  $ PM2.5     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PM10      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NO        : num  0.92 0.97 17.4 1.7 22.1 ...
##  $ NO2       : num  18.2 15.7 19.3 18.5 21.4 ...
##  $ NOx       : num  17.1 16.5 29.7 18 37.8 ...
##  $ NH3       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CO        : num  0.92 0.97 17.4 1.7 22.1 ...
##  $ SO2       : num  27.6 24.6 29.1 18.6 39.3 ...
##  $ O3        : num  133.4 34.1 30.7 36.1 39.3 ...
##  $ Benzene   : num  0 3.68 6.8 4.43 7.01 5.42 0 0 0 0 ...
##  $ Toluene   : num  0.02 5.5 16.4 10.14 18.89 ...
##  $ Xylene    : num  0 3.77 2.25 1 2.78 1.93 0 0 0 0 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
## Per-column summary; NA's rows feed the missing-data tally noted below.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_day)
##      City               Date               PM2.5             PM10        
##  Length:29531       Length:29531       Min.   :  0.04   Min.   :   0.01  
##  Class :character   Class :character   1st Qu.: 28.82   1st Qu.:  56.26  
##  Mode  :character   Mode  :character   Median : 48.57   Median :  95.68  
##                                        Mean   : 67.45   Mean   : 118.13  
##                                        3rd Qu.: 80.59   3rd Qu.: 149.75  
##                                        Max.   :949.99   Max.   :1000.00  
##                                        NA's   :4598     NA's   :11140    
##        NO              NO2              NOx              NH3        
##  Min.   :  0.02   Min.   :  0.01   Min.   :  0.00   Min.   :  0.01  
##  1st Qu.:  5.63   1st Qu.: 11.75   1st Qu.: 12.82   1st Qu.:  8.58  
##  Median :  9.89   Median : 21.69   Median : 23.52   Median : 15.85  
##  Mean   : 17.57   Mean   : 28.56   Mean   : 32.31   Mean   : 23.48  
##  3rd Qu.: 19.95   3rd Qu.: 37.62   3rd Qu.: 40.13   3rd Qu.: 30.02  
##  Max.   :390.68   Max.   :362.21   Max.   :467.63   Max.   :352.89  
##  NA's   :3582     NA's   :3585     NA's   :4185     NA's   :10328   
##        CO               SO2               O3            Benzene       
##  Min.   :  0.000   Min.   :  0.01   Min.   :  0.01   Min.   :  0.000  
##  1st Qu.:  0.510   1st Qu.:  5.67   1st Qu.: 18.86   1st Qu.:  0.120  
##  Median :  0.890   Median :  9.16   Median : 30.84   Median :  1.070  
##  Mean   :  2.249   Mean   : 14.53   Mean   : 34.49   Mean   :  3.281  
##  3rd Qu.:  1.450   3rd Qu.: 15.22   3rd Qu.: 45.57   3rd Qu.:  3.080  
##  Max.   :175.810   Max.   :193.86   Max.   :257.73   Max.   :455.030  
##  NA's   :2059      NA's   :3854     NA's   :4022     NA's   :5623     
##     Toluene            Xylene            AQI          AQI_Bucket       
##  Min.   :  0.000   Min.   :  0.00   Min.   :  13.0   Length:29531      
##  1st Qu.:  0.600   1st Qu.:  0.14   1st Qu.:  81.0   Class :character  
##  Median :  2.970   Median :  0.98   Median : 118.0   Mode  :character  
##  Mean   :  8.701   Mean   :  3.07   Mean   : 166.5                     
##  3rd Qu.:  9.150   3rd Qu.:  3.35   3rd Qu.: 208.0                     
##  Max.   :454.850   Max.   :170.37   Max.   :2049.0                     
##  NA's   :8041      NA's   :18109    NA's   :4681
## NOTE(review): yet another attach() — every pollutant name is now masked
## three deep on the search path; the "" -> NA recode below does not update
## any of the attached copies. Prefer explicit data-frame access.
attach(AQ_city_day)
## The following objects are masked from AQ_station_day:
## 
##     AQI, AQI_Bucket, Benzene, CO, Date, NH3, NO, NO2, NOx, O3, PM10,
##     PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
## 
##     AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
##     SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
## 
##     City
## Recode empty-string cells (AQI_Bucket) to NA across the data frame.
AQ_city_day [AQ_city_day == ""] <- NA

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:

## PM2.5: 4598  PM10: 11140  NO: 3582  NO2: 3585 NOx: 4185 NH3: 10328 
## CO: 2059  SO2: 3854 O3: 4022  Benzene: 5623  Toluene: 8041 Xylene: 18109
## Rows per AQI bucket for the city-day data (grouped tibble below).
AQ_city_day %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket       n
##   <chr>        <int>
## 1 Good          1341
## 2 Moderate      8829
## 3 Poor          2781
## 4 Satisfactory  8224
## 5 Severe        1338
## 6 Very Poor     2337
## 7 <NA>          4681
## Looks like Moderate entries are the highest ones, followed by Satisfactory 
## but third highest is NA entries...

## AQI city-level hourly readings: city_hour.csv
## FIX: this line was a bare header and the load statement was lost;
## reconstructed from the read.csv() pattern used for the other datasets
## (AQ_city_hour is used immediately below) — verify the path.
AQ_city_hour <- read.csv("./datasets/city_hour.csv")

## Have a look at the data
## 707,875 city-hour rows x 16 columns; same schema keyed by City + Datetime.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_hour)
## [1] 707875     16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_hour)
## Rows: 707,875
## Columns: 16
## $ City       <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Datetime   <chr> "2015-01-01 01:00:00", "2015-01-01 02:00:00", "2015-01-01 0…
## $ PM2.5      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO         <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ NO2        <dbl> 40.01, 27.75, 19.32, 16.45, 14.90, 15.95, 15.94, 16.66, 16.…
## $ NOx        <dbl> 36.37, 19.73, 11.08, 9.20, 7.85, 10.82, 12.47, 16.48, 18.02…
## $ NH3        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO         <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ SO2        <dbl> 122.07, 85.90, 52.83, 39.53, 32.63, 29.87, 27.41, 20.92, 16…
## $ O3         <dbl> NA, NA, NA, 153.58, NA, 64.25, 191.96, 177.21, 122.08, NA, …
## $ Benzene    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Toluene    <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Xylene     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ AQI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
## Column names: same 16-column pollutant schema as the other AQI tables.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_hour)
##  [1] "City"       "Datetime"   "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
## Structure dump: Datetime is chr here as well.
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_hour)
## 'data.frame':    707875 obs. of  16 variables:
##  $ City      : chr  "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
##  $ Datetime  : chr  "2015-01-01 01:00:00" "2015-01-01 02:00:00" "2015-01-01 03:00:00" "2015-01-01 04:00:00" ...
##  $ PM2.5     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PM10      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NO        : num  1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
##  $ NO2       : num  40 27.8 19.3 16.4 14.9 ...
##  $ NOx       : num  36.37 19.73 11.08 9.2 7.85 ...
##  $ NH3       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CO        : num  1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
##  $ SO2       : num  122.1 85.9 52.8 39.5 32.6 ...
##  $ O3        : num  NA NA NA 154 NA ...
##  $ Benzene   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Toluene   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Xylene    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
## Per-column summary; NA's rows feed the missing-data tally noted below.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_hour)
##      City             Datetime             PM2.5              PM10        
##  Length:707875      Length:707875      Min.   :   0.01   Min.   :   0.01  
##  Class :character   Class :character   1st Qu.:  26.20   1st Qu.:  52.38  
##  Mode  :character   Mode  :character   Median :  46.42   Median :  91.50  
##                                        Mean   :  67.62   Mean   : 119.08  
##                                        3rd Qu.:  79.49   3rd Qu.: 147.52  
##                                        Max.   : 999.99   Max.   :1000.00  
##                                        NA's   :145088    NA's   :296737   
##        NO              NO2              NOx              NH3        
##  Min.   :  0.01   Min.   :  0.01   Min.   :  0.00   Min.   :  0.01  
##  1st Qu.:  3.84   1st Qu.: 10.81   1st Qu.: 10.66   1st Qu.:  8.12  
##  Median :  7.96   Median : 20.32   Median : 20.79   Median : 15.38  
##  Mean   : 17.42   Mean   : 28.89   Mean   : 32.29   Mean   : 23.61  
##  3rd Qu.: 16.15   3rd Qu.: 36.35   3rd Qu.: 37.15   3rd Qu.: 29.23  
##  Max.   :499.99   Max.   :499.51   Max.   :498.61   Max.   :499.97  
##  NA's   :116632   NA's   :117122   NA's   :123224   NA's   :272542  
##        CO              SO2               O3            Benzene      
##  Min.   :  0.00   Min.   :  0.01   Min.   :  0.01   Min.   :  0.00  
##  1st Qu.:  0.42   1st Qu.:  4.88   1st Qu.: 13.42   1st Qu.:  0.05  
##  Median :  0.80   Median :  8.37   Median : 26.24   Median :  0.86  
##  Mean   :  2.18   Mean   : 14.04   Mean   : 34.80   Mean   :  3.09  
##  3rd Qu.:  1.37   3rd Qu.: 14.78   3rd Qu.: 47.62   3rd Qu.:  2.75  
##  Max.   :498.57   Max.   :199.96   Max.   :497.62   Max.   :498.07  
##  NA's   :86517    NA's   :130373   NA's   :129208   NA's   :163646  
##     Toluene           Xylene            AQI          AQI_Bucket       
##  Min.   :  0.00   Min.   :  0.0    Min.   :   8.0   Length:707875     
##  1st Qu.:  0.37   1st Qu.:  0.1    1st Qu.:  79.0   Class :character  
##  Median :  2.59   Median :  0.8    Median : 116.0   Mode  :character  
##  Mean   :  8.66   Mean   :  3.1    Mean   : 166.4                     
##  3rd Qu.:  8.41   3rd Qu.:  3.1    3rd Qu.: 208.0                     
##  Max.   :499.40   Max.   :500.0    Max.   :3133.0                     
##  NA's   :220607   NA's   :455829   NA's   :129080
# NOTE(review): attach() is discouraged - it puts COPIES of the columns on the
# search path, and the masking messages below show how many datasets now
# shadow each other. In particular, the ""->NA recode below runs AFTER
# attach(), so the attached copies still contain "" strings. Prefer
# AQ_city_hour$col or dplyr verbs over attached names.
attach(AQ_city_hour)
## The following objects are masked from AQ_city_day:
## 
##     AQI, AQI_Bucket, Benzene, CO, City, NH3, NO, NO2, NOx, O3, PM10,
##     PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_day:
## 
##     AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
##     SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
## 
##     AQI, AQI_Bucket, Benzene, CO, Datetime, NH3, NO, NO2, NOx, O3,
##     PM10, PM2.5, SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
## 
##     City
# Recode empty strings (e.g. blank AQI_Bucket entries) as proper NA values
AQ_city_hour [AQ_city_hour == ""] <- NA

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them (counts from the summary above):
## PM2.5: 145088 PM10: 296737 NO: 116632   NO2: 117122  NOx: 123224 NH3:  272542
## CO: 86517  SO2: 130373   O3: 129208 Benzene: 163646   Toluene: 220607 Xylene: 455829
# Row count per AQI category (NA = hours with no computed AQI)
AQ_city_hour %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket        n
##   <chr>         <int>
## 1 Good          38611
## 2 Moderate     198991
## 3 Poor          66654
## 4 Satisfactory 189434
## 5 Severe        27650
## 6 Very Poor     57455
## 7 <NA>         129080
## Looks like Moderate entries are the highest ones, followed by Satisfactory 
## but third highest is NA entries...

Airport_delay: Aiport_Delay.csv

## Have a look at the data

# First-pass inspection of the airport delay table: size and column preview
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Airport_delay)
## [1] 14952    22
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Airport_delay)
## Rows: 14,952
## Columns: 22
## $ Date                                            <chr> "28-1-18", "28-1-18", …
## $ Departure.Airport                               <chr> "BLR", "CCU", "DEL", "…
## $ Departure.Airport.Rating..out.of.10.            <dbl> NA, NA, 7.99, 7.29, NA…
## $ Departure.Airport.On.Time.Rating..out.of.10.    <dbl> NA, NA, 7.3, 6.2, NA, …
## $ Departure.Airport.Service.Rating..out.of.10.    <dbl> NA, NA, 9.1, 9.0, NA, …
## $ Arrival.Airport                                 <chr> "DEL", "DEL", "HYD", "…
## $ Arrival.Airport.Rating..out.of.10.              <dbl> 7.99, 7.99, 8.27, 7.99…
## $ Arrival.Airport.On.Time.Rating..out.of.10.      <dbl> 7.3, 7.3, 7.8, 7.3, 6.…
## $ Arrival.Airport.Service.Rating..out.of.10.      <dbl> 9.1, 9.1, 9.0, 9.1, 9.…
## $ Airplane.Type                                   <chr> "", "", "", "", "", "A…
## $ Expected.Departure.Time                         <chr> "6:10", "7:00", "7:05"…
## $ Departure.Time                                  <chr> "6:10", "7:01", "7:33"…
## $ Departure.Delay                                 <chr> "0:00:00", "0:01:00", …
## $ Duration                                        <chr> "2:20", "2:09", "1:46"…
## $ Expected.Arrival.Time                           <chr> "8:55", "9:10", "9:10"…
## $ Arrival.Time                                    <chr> "8:30", "9:10", "9:19"…
## $ Arrival.Time.Delay                              <chr> "-0:25:00", "0:00:00",…
## $ Carrier                                         <chr> "Air India", "Air Indi…
## $ Carrier.Rating..out.of.10.                      <dbl> 6.6, 6.6, 6.6, 6.6, 6.…
## $ Carrier.Market.Share..out.of.100.               <dbl> 12.0, 12.0, 12.0, 12.0…
## $ Carrier.Load.Factor..out.of.100.                <dbl> 80.75, 80.75, 80.75, 8…
## $ Carrier.On.Time.Performance.Rating..out.of.100. <dbl> 70.3, 70.3, 70.3, 70.3…
# List the 22 column names of the airport delay table
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Airport_delay)
##  [1] "Date"                                           
##  [2] "Departure.Airport"                              
##  [3] "Departure.Airport.Rating..out.of.10."           
##  [4] "Departure.Airport.On.Time.Rating..out.of.10."   
##  [5] "Departure.Airport.Service.Rating..out.of.10."   
##  [6] "Arrival.Airport"                                
##  [7] "Arrival.Airport.Rating..out.of.10."             
##  [8] "Arrival.Airport.On.Time.Rating..out.of.10."     
##  [9] "Arrival.Airport.Service.Rating..out.of.10."     
## [10] "Airplane.Type"                                  
## [11] "Expected.Departure.Time"                        
## [12] "Departure.Time"                                 
## [13] "Departure.Delay"                                
## [14] "Duration"                                       
## [15] "Expected.Arrival.Time"                          
## [16] "Arrival.Time"                                   
## [17] "Arrival.Time.Delay"                             
## [18] "Carrier"                                        
## [19] "Carrier.Rating..out.of.10."                     
## [20] "Carrier.Market.Share..out.of.100."              
## [21] "Carrier.Load.Factor..out.of.100."               
## [22] "Carrier.On.Time.Performance.Rating..out.of.100."
# Column types: note that times/delays/durations are all plain character
# strings ("H:MM", "H:MM:SS"), not parsed time values
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Airport_delay)
## 'data.frame':    14952 obs. of  22 variables:
##  $ Date                                           : chr  "28-1-18" "28-1-18" "28-1-18" "28-1-18" ...
##  $ Departure.Airport                              : chr  "BLR" "CCU" "DEL" "BOM" ...
##  $ Departure.Airport.Rating..out.of.10.           : num  NA NA 7.99 7.29 NA 7.99 NA NA 7.99 NA ...
##  $ Departure.Airport.On.Time.Rating..out.of.10.   : num  NA NA 7.3 6.2 NA 7.3 NA NA 7.3 NA ...
##  $ Departure.Airport.Service.Rating..out.of.10.   : num  NA NA 9.1 9 NA 9.1 NA NA 9.1 NA ...
##  $ Arrival.Airport                                : chr  "DEL" "DEL" "HYD" "DEL" ...
##  $ Arrival.Airport.Rating..out.of.10.             : num  7.99 7.99 8.27 7.99 7.29 8.27 7.29 7.99 8.27 7.29 ...
##  $ Arrival.Airport.On.Time.Rating..out.of.10.     : num  7.3 7.3 7.8 7.3 6.2 7.8 6.2 7.3 7.8 6.2 ...
##  $ Arrival.Airport.Service.Rating..out.of.10.     : num  9.1 9.1 9 9.1 9 9 9 9.1 9 9 ...
##  $ Airplane.Type                                  : chr  "" "" "" "" ...
##  $ Expected.Departure.Time                        : chr  "6:10" "7:00" "7:05" "7:00" ...
##  $ Departure.Time                                 : chr  "6:10" "7:01" "7:33" "7:07" ...
##  $ Departure.Delay                                : chr  "0:00:00" "0:01:00" "0:28:00" "0:07:00" ...
##  $ Duration                                       : chr  "2:20" "2:09" "1:46" "1:40" ...
##  $ Expected.Arrival.Time                          : chr  "8:55" "9:10" "9:10" "9:05" ...
##  $ Arrival.Time                                   : chr  "8:30" "9:10" "9:19" "8:47" ...
##  $ Arrival.Time.Delay                             : chr  "-0:25:00" "0:00:00" "0:09:00" "-0:18:00" ...
##  $ Carrier                                        : chr  "Air India" "Air India" "Air India" "Air India" ...
##  $ Carrier.Rating..out.of.10.                     : num  6.6 6.6 6.6 6.6 6.6 7.2 7.2 7.9 7.9 7.9 ...
##  $ Carrier.Market.Share..out.of.100.              : num  12 12 12 12 12 8.8 8.8 39.7 39.7 39.7 ...
##  $ Carrier.Load.Factor..out.of.100.               : num  80.8 80.8 80.8 80.8 80.8 ...
##  $ Carrier.On.Time.Performance.Rating..out.of.100.: num  70.3 70.3 70.3 70.3 70.3 91.8 91.8 87.4 87.4 87.4 ...
# Numeric summary; departure-side rating columns have 10043 NAs each
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Airport_delay)
##      Date           Departure.Airport  Departure.Airport.Rating..out.of.10.
##  Length:14952       Length:14952       Min.   :7.290                       
##  Class :character   Class :character   1st Qu.:7.290                       
##  Mode  :character   Mode  :character   Median :7.990                       
##                                        Mean   :7.741                       
##                                        3rd Qu.:7.990                       
##                                        Max.   :8.270                       
##                                        NA's   :10043                       
##  Departure.Airport.On.Time.Rating..out.of.10.
##  Min.   :6.200                               
##  1st Qu.:6.200                               
##  Median :7.300                               
##  Mean   :6.908                               
##  3rd Qu.:7.300                               
##  Max.   :7.800                               
##  NA's   :10043                               
##  Departure.Airport.Service.Rating..out.of.10. Arrival.Airport   
##  Min.   :9.000                                Length:14952      
##  1st Qu.:9.000                                Class :character  
##  Median :9.100                                Mode  :character  
##  Mean   :9.064                                                  
##  3rd Qu.:9.100                                                  
##  Max.   :9.100                                                  
##  NA's   :10043                                                  
##  Arrival.Airport.Rating..out.of.10. Arrival.Airport.On.Time.Rating..out.of.10.
##  Min.   :7.29                       Min.   :6.200                             
##  1st Qu.:7.99                       1st Qu.:7.300                             
##  Median :7.99                       Median :7.300                             
##  Mean   :7.91                       Mean   :7.187                             
##  3rd Qu.:7.99                       3rd Qu.:7.300                             
##  Max.   :8.27                       Max.   :7.800                             
##                                                                               
##  Arrival.Airport.Service.Rating..out.of.10. Airplane.Type     
##  Min.   :9.000                              Length:14952      
##  1st Qu.:9.000                              Class :character  
##  Median :9.100                              Mode  :character  
##  Mean   :9.059                                                
##  3rd Qu.:9.100                                                
##  Max.   :9.100                                                
##                                                               
##  Expected.Departure.Time Departure.Time     Departure.Delay   
##  Length:14952            Length:14952       Length:14952      
##  Class :character        Class :character   Class :character  
##  Mode  :character        Mode  :character   Mode  :character  
##                                                               
##                                                               
##                                                               
##                                                               
##    Duration         Expected.Arrival.Time Arrival.Time       Arrival.Time.Delay
##  Length:14952       Length:14952          Length:14952       Length:14952      
##  Class :character   Class :character      Class :character   Class :character  
##  Mode  :character   Mode  :character      Mode  :character   Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##    Carrier          Carrier.Rating..out.of.10.
##  Length:14952       Min.   :6.600             
##  Class :character   1st Qu.:6.800             
##  Mode  :character   Median :7.200             
##                     Mean   :7.531             
##                     3rd Qu.:7.900             
##                     Max.   :9.200             
##                                               
##  Carrier.Market.Share..out.of.100. Carrier.Load.Factor..out.of.100.
##  Min.   : 3.6                      Min.   :80.75                   
##  1st Qu.: 4.0                      1st Qu.:81.80                   
##  Median :12.0                      Median :86.00                   
##  Mean   :13.2                      Mean   :86.88                   
##  3rd Qu.:13.1                      3rd Qu.:93.30                   
##  Max.   :39.7                      Max.   :93.90                   
##                                                                    
##  Carrier.On.Time.Performance.Rating..out.of.100.
##  Min.   :70.30                                  
##  1st Qu.:74.70                                  
##  Median :87.40                                  
##  Mean   :83.14                                  
##  3rd Qu.:89.10                                  
##  Max.   :91.80                                  
## 
# NOTE(review): as with AQ_city_hour above, attach() runs BEFORE the ""->NA
# recode below, so the attached copies of these columns keep the empty
# strings. attach() is generally discouraged; prefer Airport_delay$col.
attach(Airport_delay)
## The following object is masked from AQ_city_day:
## 
##     Date
## The following object is masked from AQ_station_day:
## 
##     Date
# Recode empty strings (e.g. blank Airplane.Type) as proper NA values
Airport_delay [Airport_delay == ""] <- NA

# Unique (departure airport, on-time rating) pairs; group_by()+summarize()
# with no summary expressions acts like distinct() here
Airport_delay %>% group_by(Departure.Airport, Departure.Airport.On.Time.Rating..out.of.10.)%>%summarize()
## `summarise()` has grouped output by 'Departure.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 5 × 2
## # Groups:   Departure.Airport [5]
##   Departure.Airport Departure.Airport.On.Time.Rating..out.of.10.
##   <chr>                                                    <dbl>
## 1 BLR                                                       NA  
## 2 BOM                                                        6.2
## 3 CCU                                                       NA  
## 4 DEL                                                        7.3
## 5 HYD                                                        7.8
##Mumbai seems to have the worst rating for departure on time performance

# Same distinct-pairs trick for the arrival side
Airport_delay %>% group_by(Arrival.Airport, Arrival.Airport.On.Time.Rating..out.of.10.)%>%summarize()
## `summarise()` has grouped output by 'Arrival.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 3 × 2
## # Groups:   Arrival.Airport [3]
##   Arrival.Airport Arrival.Airport.On.Time.Rating..out.of.10.
##   <chr>                                                <dbl>
## 1 BOM                                                    6.2
## 2 DEL                                                    7.3
## 3 HYD                                                    7.8
##Mumbai seems to have the worst rating for Arrival on time performance as well

#################### Cleaning Datasets ######################

## Helper: keep only the rows that have no missing value in any column.
drop_incomplete <- function(df) {
  df[complete.cases(df), ]
}

## Drop every weather row that still contains an NA (e.g. missing tavg)
New_Weather_Bangalore <- drop_incomplete(Weather_Bangalore)
New_Weather_Delhi <- drop_incomplete(Weather_Delhi)
New_Weather_Lucknow <- drop_incomplete(Weather_Lucknow)
New_Weather_Mumbai <- drop_incomplete(Weather_Mumbai)
New_Weather_Jodhpur <- drop_incomplete(Weather_Jodhpur)

## Bhubhneshwar and Rourkela carry extra columns: snow and tsun have no valid
## entries, and wdir/wspd/pres/wpgt are absent from the other stations, so
## drop them first to keep every station table on a common schema.
Standard_Weather_Bhubhneshwar <- subset(Weather_Bhubhneshwar,
                                        select = -c(snow, wdir, wspd, pres, tsun, wpgt))
New_Weather_Bhubhneshwar <- drop_incomplete(Standard_Weather_Bhubhneshwar)

Standard_Weather_Rourkela <- subset(Weather_Rourkela,
                                    select = -c(snow, wdir, wspd, pres, tsun, wpgt))
New_Weather_Rourkela <- drop_incomplete(Standard_Weather_Rourkela)

## For the AQI data we only need active stations, then complete rows only
New_AQ_stations <- AQ_stations %>% filter(Status == "Active")
New_AQ_station_hour <- drop_incomplete(AQ_station_hour)
New_AQ_station_day <- drop_incomplete(AQ_station_day)
New_AQ_city_hour <- drop_incomplete(AQ_city_hour)
New_AQ_city_day <- drop_incomplete(AQ_city_day)

## Clean the airport delay data the same way
New_Airport_delay <- drop_incomplete(Airport_delay)

Detect Outliers in the Cleaned Datasets

Since our analysis aims to find whether extreme weather conditions affect flight traffic, we are actually looking for the outliers — unlike typical analyses, where outliers are removed.

Exploratory Analysis of Bangalore Weather Dataset

## Histograms to eyeball the outlier thresholds for Bangalore
hist(x = New_Weather_Bangalore$tavg, main = "Bangalore Average Temparature")

## Data outside <20 and >30 are outliers for Bangalore average

hist(x = New_Weather_Bangalore$tmin, main = "Bangalore Min Temparature")

## Data outside <16 are outliers for Bangalore min

hist(x = New_Weather_Bangalore$tmax, main = "Bangalore Max  Temparature")

## Data outside >35 are outliers for Bangalore max

hist(x = New_Weather_Bangalore$prcp, main = "Bangalore Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Keep only the extreme-weather rows - outliers are exactly what this
## analysis is after
Special_Weather_Bangalore <- New_Weather_Bangalore %>%
  filter((tavg < 20) | (tavg > 30) | (tmin < 16) | (tmax > 35) | (prcp > 50))

## Precipitation impacts flights the most, so look at how it relates to the
## temperatures. FIX: the point size is now mapped inside aes() with
## scale_size_identity() (same literal sizes, no legend) instead of passing
## Special_Weather_Bangalore$prcp/75 directly to geom_point(), which bypasses
## ggplot's data pipeline and breaks under faceting/filtering.
ggplot(Special_Weather_Bangalore, aes(x = tmin,
                                      y = tmax,
                                      color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

## The extreme precipitation happens when tmin is between 16 and 22,
## so zoom into that band and redraw
Ext_Special_Weather_Bangalore <- Special_Weather_Bangalore %>%
  filter((tmin > 16) & (tmin < 22))

ggplot(Ext_Special_Weather_Bangalore, aes(x = tmin,
                                          y = tmax,
                                          color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Chennai Weather Dataset (TODO: no cleaned Chennai weather table was created in the cleaning section above, so this section is empty)

Exploratory Analysis of Delhi Weather Dataset

## Histograms to eyeball the outlier thresholds for Delhi
hist(x = New_Weather_Delhi$tavg, main = "Delhi Average Temparature")

## Data outside <15 and >35 are outliers for Delhi average

hist(x = New_Weather_Delhi$tmin, main = "Delhi Min Temparature")

## Data outside <16 are outliers for Delhi min
## NOTE(review): the filter below uses tmin < 10, not < 16 - confirm cutoff

hist(x = New_Weather_Delhi$tmax, main = "Delhi Max  Temparature")

## Data outside >35 are outliers for Delhi max
## NOTE(review): the filter below uses tmax > 30, not > 35 - confirm cutoff

hist(x = New_Weather_Delhi$prcp, main = "Delhi Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Keep only the extreme-weather rows
Special_Weather_Delhi <- New_Weather_Delhi %>%
  filter((tavg < 15) | (tavg > 35) | (tmin < 10) | (tmax > 30) | (prcp > 50))

## Precipitation impacts flights the most; FIX: size is mapped inside aes()
## with scale_size_identity() (same literal sizes, no legend) instead of
## passing Special_Weather_Delhi$prcp/50 directly to geom_point().
ggplot(Special_Weather_Delhi, aes(x = tmin,
                                  y = tmax,
                                  color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

## The extreme precipitation happens when tmin is between 20 and 30,
## so zoom into that band and redraw
Ext_Special_Weather_Delhi <- Special_Weather_Delhi %>%
  filter((tmin > 20) & (tmin < 30))

ggplot(Ext_Special_Weather_Delhi, aes(x = tmin,
                                      y = tmax,
                                      color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Lucknow Weather Dataset

## Histograms to eyeball the outlier thresholds for Lucknow
hist(x = New_Weather_Lucknow$tavg, main = "Lucknow Average Temparature")

## Data outside <16 and >33 are outliers for Lucknow average

hist(x = New_Weather_Lucknow$tmin, main = "Lucknow Min Temparature")

## Data outside <15 are outliers for Lucknow min

hist(x = New_Weather_Lucknow$tmax, main = "Lucknow Max  Temparature")

## Data outside >35 are outliers for Lucknow max
## NOTE(review): the filter below uses tmax > 30, not > 35 - confirm cutoff

hist(x = New_Weather_Lucknow$prcp, main = "Lucknow Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Keep only the extreme-weather rows
Special_Weather_Lucknow <- New_Weather_Lucknow %>%
  filter((tavg < 16) | (tavg > 33) | (tmin < 15) | (tmax > 30) | (prcp > 50))

## Precipitation impacts flights the most; FIX: size is mapped inside aes()
## with scale_size_identity() (same literal sizes, no legend) instead of
## passing Special_Weather_Lucknow$prcp/50 directly to geom_point().
ggplot(Special_Weather_Lucknow, aes(x = tmin,
                                    y = tmax,
                                    color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

## The extreme precipitation happens when tmin is between 20 and 30,
## so zoom into that band and redraw
Ext_Special_Weather_Lucknow <- Special_Weather_Lucknow %>%
  filter((tmin > 20) & (tmin < 30))

ggplot(Ext_Special_Weather_Lucknow, aes(x = tmin,
                                        y = tmax,
                                        color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Mumbai Weather Dataset

## Histograms to eyeball the outlier thresholds for Mumbai
hist(x = New_Weather_Mumbai$tavg, main = "Mumbai Average Temparature")

## Data outside <25 and >30 are outliers for Mumbai average

hist(x = New_Weather_Mumbai$tmin, main = "Mumbai Min Temparature")

## Data outside <17 are outliers for Mumbai min

hist(x = New_Weather_Mumbai$tmax, main = "Mumbai Max  Temparature")

## Data outside >35 are outliers for Mumbai max

hist(x = New_Weather_Mumbai$prcp, main = "Mumbai Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Keep only the extreme-weather rows
Special_Weather_Mumbai <- New_Weather_Mumbai %>%
  filter((tavg < 25) | (tavg > 30) | (tmin < 17) | (tmax > 35) | (prcp > 50))

## Precipitation impacts flights the most; FIX: size is mapped inside aes()
## with scale_size_identity() (same literal sizes, no legend) instead of
## passing Special_Weather_Mumbai$prcp/50 directly to geom_point().
ggplot(Special_Weather_Mumbai, aes(x = tmin,
                                   y = tmax,
                                   color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

## The extreme precipitation happens when tmin is between 22 and 27,
## so zoom into that band and redraw
Ext_Special_Weather_Mumbai <- Special_Weather_Mumbai %>%
  filter((tmin > 22) & (tmin < 27))

ggplot(Ext_Special_Weather_Mumbai, aes(x = tmin,
                                       y = tmax,
                                       color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Jodhpur Weather Dataset

## Histograms to eyeball the outlier thresholds for Jodhpur
hist(x = New_Weather_Jodhpur$tavg, main = "Jodhpur Average Temparature")

## Data outside <22 and >28 are outliers for Jodhpur average

hist(x = New_Weather_Jodhpur$tmin, main = "Jodhpur Min Temparature")

## Data outside <16 are outliers for Jodhpur min

hist(x = New_Weather_Jodhpur$tmax, main = "Jodhpur Max  Temparature")

## Data outside >33 are outliers for Jodhpur max

hist(x = New_Weather_Jodhpur$prcp, main = "Jodhpur Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Keep only the extreme-weather rows
Special_Weather_Jodhpur <- New_Weather_Jodhpur %>%
  filter((tavg < 22) | (tavg > 28) | (tmin < 16) | (tmax > 33) | (prcp > 50))

## Precipitation impacts flights the most; FIX: size is mapped inside aes()
## with scale_size_identity() (same literal sizes, no legend) instead of
## passing Special_Weather_Jodhpur$prcp/50 directly to geom_point().
ggplot(Special_Weather_Jodhpur, aes(x = tmin,
                                    y = tmax,
                                    color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

## The extreme precipitation happens when tmin is between 17 and 23,
## so zoom into that band and redraw
Ext_Special_Weather_Jodhpur <- Special_Weather_Jodhpur %>%
  filter((tmin > 17) & (tmin < 23))

ggplot(Ext_Special_Weather_Jodhpur, aes(x = tmin,
                                        y = tmax,
                                        color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Bhubhenshwar Weather Dataset

## Histograms to eyeball the outlier thresholds for Bhubhenshwar
hist(x = New_Weather_Bhubhneshwar$tavg, main = "Bhubhenshwar Average Temparature")

## Data outside <24 and >32 are outliers for Bhubhenshwar average

hist(x = New_Weather_Bhubhneshwar$tmin, main = "Bhubhenshwar Min Temparature")

## Data outside <15 are outliers for Bhubhenshwar min

hist(x = New_Weather_Bhubhneshwar$tmax, main = "Bhubhenshwar Max  Temparature")

## Data outside >35 are outliers for Bhubhenshwar max

hist(x = New_Weather_Bhubhneshwar$prcp, main = "Bhubhenshwar Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Keep only the extreme-weather rows
## NOTE(review): the source table is spelled "Bhubhneshwar" but the derived
## tables "Bhubhenshwar" - inconsistent spellings preserved intentionally
Special_Weather_Bhubhenshwar <- New_Weather_Bhubhneshwar %>%
  filter((tavg < 24) | (tavg > 32) | (tmin < 15) | (tmax > 35) | (prcp > 50))

## Precipitation impacts flights the most; FIX: size is mapped inside aes()
## with scale_size_identity() (same literal sizes, no legend) instead of
## passing Special_Weather_Bhubhenshwar$prcp/50 directly to geom_point().
ggplot(Special_Weather_Bhubhenshwar, aes(x = tmin,
                                         y = tmax,
                                         color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

## The extreme precipitation happens when tmin is between 17 and 27,
## so zoom into that band and redraw
Ext_Special_Weather_Bhubhenshwar <- Special_Weather_Bhubhenshwar %>%
  filter((tmin > 17) & (tmin < 27))

ggplot(Ext_Special_Weather_Bhubhenshwar, aes(x = tmin,
                                             y = tmax,
                                             color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Rourkela Weather Dataset

## Histograms to eyeball the outlier thresholds for Rourkela
hist(x = New_Weather_Rourkela$tavg, main = "Rourkela Average Temparature")

## Data outside <20 and >32 are outliers for Rourkela average

hist(x = New_Weather_Rourkela$tmin, main = "Rourkela Min Temparature")

## Data outside <15 are outliers for Rourkela min

hist(x = New_Weather_Rourkela$tmax, main = "Rourkela Max  Temparature")

## Data outside >35 are outliers for Rourkela max
## NOTE(review): the filter below uses tmax > 30, not > 35 - confirm cutoff

hist(x = New_Weather_Rourkela$prcp, main = "Rourkela Precipitation", breaks = 5)

## Extreme precipitation cases are above 40

## Keep only the extreme-weather rows
Special_Weather_Rourkela <- New_Weather_Rourkela %>%
  filter((tavg < 20) | (tavg > 32) | (tmin < 15) | (tmax > 30) | (prcp > 40))

## Precipitation impacts flights the most; FIX: size is mapped inside aes()
## with scale_size_identity() (same literal sizes, no legend) instead of
## passing Special_Weather_Rourkela$prcp/50 directly to geom_point().
ggplot(Special_Weather_Rourkela, aes(x = tmin,
                                     y = tmax,
                                     color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

## The extreme precipitation happens when tmin is between 22 and 27,
## so zoom into that band and redraw
Ext_Special_Weather_Rourkela <- Special_Weather_Rourkela %>%
  filter((tmin > 22) & (tmin < 27))

ggplot(Ext_Special_Weather_Rourkela, aes(x = tmin,
                                         y = tmax,
                                         color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  scale_size_identity() +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of AQI data station wise

# Peek at the fully-complete station-hour rows that survived cleaning
head(New_AQ_station_hour)
##    StationId            Datetime  PM2.5   PM10   NO   NO2   NOx  NH3  CO   SO2
## 17     AP001 2017-11-25 09:00:00 104.00 148.50 1.93 23.00 13.75 9.80 0.1 15.30
## 18     AP001 2017-11-25 10:00:00  94.50 142.00 1.33 16.25  9.75 9.65 0.1 17.00
## 19     AP001 2017-11-25 11:00:00  82.75 126.50 1.47 14.83  9.07 9.70 0.1 15.40
## 22     AP001 2017-11-25 14:00:00  68.50 117.00 1.35 13.60  8.35 7.40 0.1 21.80
## 23     AP001 2017-11-25 15:00:00  69.25 112.25 1.52 11.80  7.55 9.25 0.1 21.38
## 24     AP001 2017-11-25 16:00:00  70.00 107.00 2.80 30.33 18.40 6.15 0.1 18.90
##        O3 Benzene Toluene Xylene AQI AQI_Bucket
## 17 117.62    0.30   10.40   0.23 155   Moderate
## 18 136.23    0.28    7.10   0.15 159   Moderate
## 19 149.92    0.20    4.55   0.08 173   Moderate
## 22 161.70    0.10    2.30   0.00 191   Moderate
## 23 161.68    0.10    2.35   0.00 191   Moderate
## 24 147.97    0.10    3.70   0.00 191   Moderate
# Lets see the performance of the AQI over years
# Split "Date Time" into a Date column plus Hr/Min/Sec, then a numeric Hour
# (NOTE(review): lubridate hour(ymd_hms(Datetime)) would do this in one step,
# but later blocks reuse the Date column created here, so the shape matters)
AQ_station_Day_Sep <- New_AQ_station_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr))

# Bucket the hour into Early_Morning (0-6], Day (6-18], Night (18-24]
AQ_station_Day_Duration <- AQ_station_Day_Sep %>% mutate(Duration=cut(Hour, breaks=c(-1, 6, 18, 24),labels=c("Early_Morning","Day","Night")))

# Mean AQI per calendar year and AQI bucket
AQI_Over_Years <- AQ_station_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
# One line per AQI bucket: mean AQI trend across years
ggplot(AQI_Over_Years, aes(x = YEAR, y = Mean_AQI, color = AQI_Bucket)) +
  geom_line()

## It appears that 'Severe' and 'Poor' cases were rare until 2017, after which
## these two grew at the expense of 'Good' AQI cases


# Lets see the performance of the AQI over a day in every year

# Mean AQI per year and time-of-day bucket (Early_Morning / Day / Night)
AQI_Over_Time <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
## BUG FIX: geom_point(size = AQI_Over_Time$Duration) passed a factor as a
## literal point size, triggering the "'*' not meaningful for factors"
## warning echoed below. Duration is already mapped to color, so a fixed
## point size is used instead.
ggplot(AQI_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration)) +
  geom_point(size = 2)
## Warning in Ops.factor(coords$size, .pt): '*' not meaningful for factors

## We can see that 2017 had the worst air quality index but worst was during day time
## Things slowed down in the years later but in them, night time pollution was high.
## In all cases, early morning pollution was the lowest.


# Lets see the performance of the AQI monthwise
# Mean AQI per calendar month (label = TRUE yields Jan..Dec) and duration
AQI_monthwise <- AQ_station_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## BUG FIX: as above, size was given the raw Duration factor (warning echoed
## below); Duration is already on color, so use a fixed point size.
ggplot(AQI_monthwise, aes(x = Month, y = Mean_AQI, color = Duration)) +
  geom_point(size = 2)
## Warning in Ops.factor(coords$size, .pt): '*' not meaningful for factors

## We can see that the colder months - i.e., from Oct to Feb, the AQI is the worst, its bad during summer but it appears the best in monsoon season.

# Year x month x part-of-day mean AQI.
AQI_Over_month <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
# Lets see how this works out yearwise and monthwise
# Fix: size must be numeric, not a factor; converting Duration to its level
# number removes the repeated "'*' not meaningful for factors" warnings.
ggplot(AQI_Over_month, aes(x = Month, y = Mean_AQI, color = Duration))+  geom_point(size = as.numeric(AQI_Over_month$Duration)) + facet_wrap(~YEAR)

## We can see the same trend every year - i.e., the colder months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values

## Now lets report this station wise - probably for the Month wise combination
AQI_Stationwise <- AQ_station_Day_Duration %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
# Fix: shape varies per row (one shape per Duration level), so it is an
# aesthetic and must be mapped inside aes(), not set to a raw factor vector.
ggplot(AQI_Stationwise, aes(x = Month, y = Mean_AQI, color = Station))+  geom_point(aes(shape = Duration))

## Across stations, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.

## Now out of the 19 stations, we are only interested in the Delhi ones, for which we are going to do air traffic impact analysis - so lets filter them and zoom into their performance alone
# Restrict to the two Delhi stations; %in% replaces the chained `|` equality
# tests (same rows selected, easier to extend).
AQI_Delhi_Station <- AQ_station_Day_Duration %>% filter(StationId %in% c("DL001", "DL019")) %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
# Fix: size must be numeric, not a factor.
ggplot(AQI_Delhi_Station, aes(x = Month, y = Mean_AQI, color = Duration))+  geom_point(size = as.numeric(AQI_Delhi_Station$Duration)) + facet_wrap(~YEAR)

# Lets see how AQ day data is different from station hour wise data
# Same yearly aggregation, but on the day-wise station dataset.
New_AQ_station_day_Years <- New_AQ_station_day%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
# Peek at the hourly-derived frame for comparison with the daily one.
head(AQ_station_Day_Duration)
##    StationId       Date Hr Min Sec  PM2.5   PM10   NO   NO2   NOx  NH3  CO
## 17     AP001 2017-11-25 09  00  00 104.00 148.50 1.93 23.00 13.75 9.80 0.1
## 18     AP001 2017-11-25 10  00  00  94.50 142.00 1.33 16.25  9.75 9.65 0.1
## 19     AP001 2017-11-25 11  00  00  82.75 126.50 1.47 14.83  9.07 9.70 0.1
## 22     AP001 2017-11-25 14  00  00  68.50 117.00 1.35 13.60  8.35 7.40 0.1
## 23     AP001 2017-11-25 15  00  00  69.25 112.25 1.52 11.80  7.55 9.25 0.1
## 24     AP001 2017-11-25 16  00  00  70.00 107.00 2.80 30.33 18.40 6.15 0.1
##      SO2     O3 Benzene Toluene Xylene AQI AQI_Bucket Hour Duration
## 17 15.30 117.62    0.30   10.40   0.23 155   Moderate    9      Day
## 18 17.00 136.23    0.28    7.10   0.15 159   Moderate   10      Day
## 19 15.40 149.92    0.20    4.55   0.08 173   Moderate   11      Day
## 22 21.80 161.70    0.10    2.30   0.00 191   Moderate   14      Day
## 23 21.38 161.68    0.10    2.35   0.00 191   Moderate   15      Day
## 24 18.90 147.97    0.10    3.70   0.00 191   Moderate   16      Day
## There seems to be nothing new that we can derive out of the station day wise that we can't derive out of 
## station hour wise data. so no further analysis needed over here

## Exploratory Analysis of AQI data city wise ----

## Lets look at City wise hourly AQI data
# Same columns as the station data, but keyed by City instead of StationId.
head(New_AQ_city_hour)
##            City            Datetime  PM2.5   PM10   NO   NO2   NOx  NH3  CO
## 50889 Amaravati 2017-11-25 09:00:00 104.00 148.50 1.93 23.00 13.75 9.80 0.1
## 50890 Amaravati 2017-11-25 10:00:00  94.50 142.00 1.33 16.25  9.75 9.65 0.1
## 50891 Amaravati 2017-11-25 11:00:00  82.75 126.50 1.47 14.83  9.07 9.70 0.1
## 50894 Amaravati 2017-11-25 14:00:00  68.50 117.00 1.35 13.60  8.35 7.40 0.1
## 50895 Amaravati 2017-11-25 15:00:00  69.25 112.25 1.52 11.80  7.55 9.25 0.1
## 50896 Amaravati 2017-11-25 16:00:00  70.00 107.00 2.80 30.33 18.40 6.15 0.1
##         SO2     O3 Benzene Toluene Xylene AQI AQI_Bucket
## 50889 15.30 117.62    0.30   10.40   0.23 155   Moderate
## 50890 17.00 136.23    0.28    7.10   0.15 159   Moderate
## 50891 15.40 149.92    0.20    4.55   0.08 173   Moderate
## 50894 21.80 161.70    0.10    2.30   0.00 191   Moderate
## 50895 21.38 161.68    0.10    2.35   0.00 191   Moderate
## 50896 18.90 147.97    0.10    3.70   0.00 191   Moderate
# Lets see the performance of the AQI over years
# Same Date/Hour split as for the station data, applied to the city table.
AQ_city_Day_Sep <- New_AQ_city_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr))

# Part-of-day bucket: (-1,6] Early_Morning, (6,18] Day, (18,24] Night.
AQ_city_Day_Duration <- AQ_city_Day_Sep %>% mutate(Duration=cut(Hour, breaks=c(-1, 6, 18, 24),labels=c("Early_Morning","Day","Night")))
##
# Mean AQI per year and AQI bucket for the city-level data.
AQI_City_Over_Years <- AQ_city_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
AQI_City_Over_Years
## # A tibble: 35 × 3
## # Groups:   YEAR [6]
##     YEAR AQI_Bucket   Mean_AQI
##    <dbl> <chr>           <dbl>
##  1  2015 Moderate        152. 
##  2  2015 Poor            249. 
##  3  2015 Satisfactory     82.0
##  4  2015 Severe          442. 
##  5  2015 Very Poor       348. 
##  6  2016 Good             40.5
##  7  2016 Moderate        125. 
##  8  2016 Poor            229. 
##  9  2016 Satisfactory     82.8
## 10  2016 Severe          455. 
## # ℹ 25 more rows
# Yearly mean AQI per AQI bucket (city-level hourly data).
ggplot(AQI_City_Over_Years, aes(x = YEAR, y = Mean_AQI)) +
  geom_line(aes(color = AQI_Bucket))

## This is the year-wise trend per AQI bucket for the city data; the month-wise seasonal pattern is examined below.

# Lets see the performance of the AQI over a day in every year

# Mean AQI per year for each part of the day (city-level data).
AQI_City_Over_Time <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
# Fix: size must be numeric, not a factor.
ggplot(AQI_City_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration))+  geom_point(size = as.numeric(AQI_City_Over_Time$Duration))

## It appears 2015 had peak values of AQIs, which dropped to very low in 2016, gained to half the levels back in 2017 and then gradually reducing
## We can see that 2015-2017 worst was during day time but from 2018, there were worse night times - may be something to do with dropped levels of AQIs as well
## In all cases, early morning pollution seems to be the lowest.


# Lets see the performance of the AQI monthwise
AQI_City_monthwise <- AQ_city_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
# Fix: size must be numeric, not a factor.
ggplot(AQI_City_monthwise, aes(x = Month, y = Mean_AQI, color = Duration))+  geom_point(size = as.numeric(AQI_City_monthwise$Duration))

## We can see that for the winter months - i.e., from Oct to Feb - the AQI is the worst; it's bad during summer but it appears best in the monsoon season. The difference from the station-wise data is that here Nov seems to be the worst month, while in the other dataset Dec held the worst values...

# Year x month x part-of-day mean AQI for the city data.
AQI_City_Over_month <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
AQI_City_Over_month
## # A tibble: 185 × 4
## # Groups:   YEAR, Month [63]
##     YEAR Month Duration      Mean_AQI
##    <dbl> <ord> <fct>            <dbl>
##  1  2015 Jan   Early_Morning     343.
##  2  2015 Jan   Day               341.
##  3  2015 Jan   Night             341.
##  4  2015 Feb   Early_Morning     329.
##  5  2015 Feb   Day               329.
##  6  2015 Feb   Night             325.
##  7  2015 Mar   Early_Morning     249.
##  8  2015 Mar   Day               262.
##  9  2015 Mar   Night             254.
## 10  2015 Apr   Early_Morning     304.
## # ℹ 175 more rows
# Lets see how this works out yearwise and monthwise
# Fix: size must be numeric, not a factor; this call used to emit the
# "'*' not meaningful for factors" warning once per facet.
ggplot(AQI_City_Over_month, aes(x = Month, y = Mean_AQI, color = Duration))+  geom_point(size = as.numeric(AQI_City_Over_month$Duration)) + facet_wrap(~YEAR)

## We can see the same trend every year - i.e., the winter months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values

## Now lets report this city wise - probably for the Month wise combination
# Per-city monthly mean AQI, then one coloured point per city/month.
AQI_Citywise <- AQ_city_Day_Sep %>%
  group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>%
  summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Citywise, aes(x = Month, y = Mean_AQI, color = City)) +
  geom_point()

## Across cities, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.

## Now out of all the cities, we are most interested in Delhi, for which we are going to do air traffic impact analysis - so lets filter it out and zoom into its performance alone
AQI_Delhi_City <- AQ_city_Day_Sep %>% filter( City == "Delhi") %>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Delhi_City, aes(x = Month, y = Mean_AQI))+  geom_point() + facet_wrap(~YEAR)

# Compare against the day-wise city dataset aggregated the same way.
New_AQ_city_day_Years <- New_AQ_city_day%>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
head(New_AQ_city_day_Years)
## # A tibble: 6 × 4
## # Groups:   City, YEAR [2]
##   City       YEAR Month Mean_AQI
##   <chr>     <dbl> <ord>    <dbl>
## 1 Amaravati  2017 Nov      184. 
## 2 Amaravati  2017 Dec      194. 
## 3 Amaravati  2018 Jan      172. 
## 4 Amaravati  2018 Feb      107. 
## 5 Amaravati  2018 Mar       84.6
## 6 Amaravati  2018 Apr       63.8
# Monthly means from the day-wise data, one colour per city.
ggplot(New_AQ_city_day_Years, aes(x = Month, y = Mean_AQI))+  geom_point(aes(color=City))

## There seems to be small difference when comparing hour wise data to day wise data, but not significant enough. So we will live with the hour wise data itself for cities.

## Analyse the parameters impacting the AQI index ----

## We would like to understand which of the parameters are really affecting the AQI value.
## Based on the analysis above we will stick to using the Cleaned Station hour wise datasets.

New_AQ_station_hour_sep <- New_AQ_station_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr), Month = month(ymd(Date)))

## Now lets focus on the months where we have the most troubles with AQI - Oct to Feb
# %in% replaces the five chained `|` equality tests (same rows selected).
New_AQ_station_hour_sep_BM <- New_AQ_station_hour_sep %>% filter(Month %in% c(1, 2, 10, 11, 12))

# Simple one-predictor linear models, AQI ~ pollutant, fitted on the full
# dataset (New_AQ_station_hour_sep) and on the bad months only (the _BM
# suffix), so the two fits can be compared. fmodel() plots each fitted model.
AQI_O3_model <- lm(AQI~O3, data = New_AQ_station_hour_sep)
fmodel(AQI_O3_model)

## OK wow, looks like AQI has a direct relationship with the O3 content

AQI_O3_model_BM <- lm(AQI~O3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_O3_model_BM)

## In bad months looks like O3 and AQI are inversely proportional


## Lets try with PM2.5
AQI_PM_2_5_model <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_2_5_model)

## OK even here there is an impact - actually much more
AQI_PM_2_5_model_BM <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_2_5_model_BM)

## PM2.5 impact seems to be much higher over the winter months

##Lets try others
AQI_PM_10_model <- lm(AQI~PM10, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_10_model)

AQI_PM_10_model_BM <- lm(AQI~PM10, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_10_model_BM)

## No significant impact change in winter months for PM10

AQI_NO_model <- lm(AQI~NO, data = New_AQ_station_hour_sep)
fmodel(AQI_NO_model)

AQI_NO_model_BM <- lm(AQI~NO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO_model_BM)

## Slight reduction in winter months for NO

AQI_NO2_model <- lm(AQI~NO2, data = New_AQ_station_hour_sep)
fmodel(AQI_NO2_model)

AQI_NO2_model_BM <- lm(AQI~NO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO2_model_BM)

## No significant impact change in winter months for NO2

AQI_NOx_model <- lm(AQI~NOx, data = New_AQ_station_hour_sep)
fmodel(AQI_NOx_model)

AQI_NOx_model_BM <- lm(AQI~NOx, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NOx_model_BM)

## Slight reduction in winter months for NOx

AQI_NH3_model <- lm(AQI~NH3, data = New_AQ_station_hour_sep)
fmodel(AQI_NH3_model)

AQI_NH3_model_BM <- lm(AQI~NH3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NH3_model_BM)

## NH3 impact seems to be much higher (50% more) over the winter months
## (this comment previously said PM2.5, a copy-paste slip)

AQI_CO_model <- lm(AQI~CO, data = New_AQ_station_hour_sep)
fmodel(AQI_CO_model)

AQI_CO_model_BM <- lm(AQI~CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_CO_model_BM)

## No significant impact change in winter months for CO

AQI_SO2_model <- lm(AQI~SO2, data = New_AQ_station_hour_sep)
fmodel(AQI_SO2_model)

AQI_SO2_model_BM <- lm(AQI~SO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_SO2_model_BM)

## Slight reduction in winter months for SO2

AQI_Benzene_model <- lm(AQI~Benzene, data = New_AQ_station_hour_sep)
fmodel(AQI_Benzene_model)

# Bug fix: the winter-months fit previously overwrote AQI_Benzene_model;
# store it under its own _BM name, consistent with every other pollutant.
AQI_Benzene_model_BM <- lm(AQI~Benzene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Benzene_model_BM)

## Slight reduction in winter months for Benzene

AQI_Toluene_model <- lm(AQI~Toluene, data = New_AQ_station_hour_sep)
fmodel(AQI_Toluene_model)

AQI_Toluene_model_BM <- lm(AQI~Toluene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Toluene_model_BM)

## Slight reduction in winter months for Toluene

AQI_Xylene_model <- lm(AQI~Xylene, data = New_AQ_station_hour_sep)
fmodel(AQI_Xylene_model)

AQI_Xylene_model_BM <- lm(AQI~Xylene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Xylene_model_BM)

## No significant impact change in winter months for Xylene

## Among these, the highest impact seems to be from PM2.5 and CO. Bringing in
## O3 due to its peculiar reversal in the winter months
# Multi-predictor model combining the strongest single predictors, fitted on
# the full data and on the bad months only.
AQI_High_Impact_model <- lm(AQI~PM2.5+O3+CO, data = New_AQ_station_hour_sep)
fmodel(AQI_High_Impact_model)

AQI_High_Impact_model_BM <- lm(AQI~PM2.5+O3+CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_High_Impact_model_BM)

## Form a cohesive Delhi dataset ----

## We have seen how components of air impacted AQI
## Time to see the impact of weather on AQI by merging the station day wise data with the weather data
## Please note we are not picking up station hour wise data because the weather data we have is only day wise data

## Out of the cities for which weather has been provided, the only city that overlaps with the AQI data is Delhi
## And ofcourse we are trying to find the impact of AQI on Airtraffic in Delhi, so lets bring in that too
## So lets merge these three datasets only for Delhi

# Keep only the two Delhi stations and parse the Date string into a Date column.
Delhi_AQI_data_temp <- New_AQ_station_day %>% filter ((StationId == "DL001") | (StationId == "DL019"))%>% mutate(Date_1 = ymd(as.Date(Date)))
# Drop column 2 (presumably the original unparsed Date - confirm against the
# source frame) and rename the parsed column back to Date.
Delhi_AQI_data <- Delhi_AQI_data_temp[, -2] %>% rename("Date" = "Date_1")

# Weather timestamps are parsed with dmy(), i.e. day/month/year strings.
New_Weather_Delhi_day <- New_Weather_Delhi %>% mutate(Date = dmy(time))

# Delhi departures only; parse the day-first Date, drop the old column, sort by date.
Delhi_Airport_Delay_date <- New_Airport_delay %>% filter (Departure.Airport == "DEL") %>% mutate(Date_1 = dmy(Date))
Delhi_Airport_Delay_rename <- Delhi_Airport_Delay_date[, -1] %>% rename("Date" = "Date_1")
Delhi_Airport_Delay_date_sorted <- Delhi_Airport_Delay_rename[order(Delhi_Airport_Delay_rename$Date),]

## The range of weather data is from 01/01/1990 to 25/07/2022
## The range of airport delay data is from 28/01/18 to 27/1/2020

## The overlap window used below is 2018-01-25 to 2020-01-27
## NOTE(review): the narrative ranges above do not exactly match the filter
## bounds below - confirm the intended overlap window.

Delhi_Airport_Delay_range <- Delhi_Airport_Delay_date_sorted %>% filter (between(Date, as.Date('2018-01-25'), as.Date('2020-01-27')))
#Delhi_Airport_Delay <- Delhi_Airport_Delay_dates %>% filter ((Date >'25-01-18') & (Date < '29-01-20'))  #1925
New_Weather_Delhi_day_range <- New_Weather_Delhi_day %>% filter (between(Date, as.Date('2018-01-25'), as.Date('2020-01-27')))

Delhi_AQI_data_range <- Delhi_AQI_data %>% filter (between(Date, as.Date('2018-01-25'), as.Date('2020-01-27')))

##Delhi_Airport_Delay data has multiple entries for a day as it is cutting across many airliners operating on an airport. But we are interested in average delay per day and not really on the airliner related information. So, lets clean the data a bit there.

# Convert a single delay value to whole minutes.
# Negative values (early arrivals/departures) are clamped to 0; otherwise the
# value is parsed with lubridate::hms() ("HH:MM:SS") and converted to
# hours*60 + minutes. Seconds are intentionally dropped, matching the
# original behaviour.
# NOTE(review): the `x < 0` test relies on R's comparison coercion when x is
# an "HH:MM:SS" string - confirm the actual type of the delay columns.
convert_min <- function(x)
{
  if (x < 0) {
    return(0)
  }
  time_d <- hms(x)
  hour(time_d) * 60 + minute(time_d)
}

# Convert each delay to minutes, then total departure + arrival delay per day.
Delhi_Airport_Delay_in_min <- Delhi_Airport_Delay_range %>% mutate (Departure_Delay_min = unlist(lapply(Departure.Delay, convert_min)), Arrival_Delay_min = unlist(lapply(Arrival.Time.Delay, convert_min)))
Delhi_Airport_Delay_datewise <- Delhi_Airport_Delay_in_min  %>% group_by(Date) %>% summarize(Daily_Delay = sum(Departure_Delay_min + Arrival_Delay_min))

# Merge weather + AQI on their common columns, drop column 2, then merge in
# the daily delays and derive the month.
# NOTE(review): this merges the full-range New_Weather_Delhi_day and
# Delhi_AQI_data rather than the *_range subsets built above; the delay table
# still restricts the result to the overlap dates, but the *_range objects
# end up unused - confirm this is intended.
Delhi_AQI_weather_data_merge_temp <- merge(New_Weather_Delhi_day, Delhi_AQI_data)
Delhi_AQI_weather_data_merge_temp_1 <- Delhi_AQI_weather_data_merge_temp[,-2]
Delhi_cohesive_dataset <- merge(Delhi_AQI_weather_data_merge_temp_1, Delhi_Airport_Delay_datewise)%>% mutate(Month = month(ymd(Date)))

# Peek at the merged AQI + weather + delay table.
head(Delhi_cohesive_dataset)
##         Date tavg tmin tmax prcp StationId PM2.5   PM10    NO   NO2   NOx   NH3
## 1 2018-06-27 30.3 26.2 37.5  3.0     DL019 48.03  89.10  4.09 39.86 24.30 18.91
## 2 2018-06-28 29.9 24.2 37.5 20.1     DL019 23.98  38.46  3.64 34.88 21.51 26.11
## 3 2018-06-29 30.7 27.9 35.2  1.0     DL019 34.77  60.62 11.30 53.24 37.53 38.76
## 4 2018-06-30 31.3 27.5 35.6  9.9     DL019 42.65 113.91  5.90 50.46 31.40 21.05
## 5 2018-07-04 31.7 26.1 36.7  5.1     DL019 44.09 138.82  2.30 34.02 19.90 27.99
## 6 2018-07-06 32.9 28.1 37.3  5.1     DL019 48.80 110.93  8.11 29.32 22.19 35.32
##     CO   SO2    O3 Benzene Toluene Xylene AQI   AQI_Bucket Daily_Delay Month
## 1 0.68 12.71  9.14    1.74   11.65   1.60  80 Satisfactory          42     6
## 2 0.52 11.41  6.42    2.10    8.95   1.23  55 Satisfactory         190     6
## 3 0.75  9.87 11.59   10.66   11.07   4.41  59 Satisfactory          66     6
## 4 0.91 12.43 10.90    8.03   15.18   4.50  92 Satisfactory          30     6
## 5 0.56  7.27 13.37    3.03   10.98   1.84 152     Moderate         120     7
## 6 0.53 10.14 15.38    3.08   16.27   2.48 104     Moderate          31     7
# Looks like the merger is successful with no NA
# Lets summarize the full dataset
summary(Delhi_cohesive_dataset)
##       Date                 tavg            tmin            tmax      
##  Min.   :2018-06-27   Min.   :10.40   Min.   : 5.30   Min.   :14.60  
##  1st Qu.:2019-02-08   1st Qu.:18.30   1st Qu.:12.80   1st Qu.:25.30  
##  Median :2019-07-07   Median :28.20   Median :23.80   Median :33.80  
##  Mean   :2019-05-16   Mean   :25.29   Mean   :20.24   Mean   :30.94  
##  3rd Qu.:2019-08-25   3rd Qu.:30.90   3rd Qu.:26.20   3rd Qu.:36.00  
##  Max.   :2020-01-18   Max.   :35.60   Max.   :28.80   Max.   :43.40  
##       prcp         StationId             PM2.5             PM10       
##  Min.   : 0.000   Length:267         Min.   : 10.06   Min.   : 10.75  
##  1st Qu.: 0.000   Class :character   1st Qu.: 33.75   1st Qu.: 79.05  
##  Median : 0.500   Mode  :character   Median : 52.11   Median :113.91  
##  Mean   : 5.285                      Mean   : 69.21   Mean   :128.93  
##  3rd Qu.: 5.100                      3rd Qu.: 86.25   3rd Qu.:159.65  
##  Max.   :70.100                      Max.   :734.56   Max.   :830.10  
##        NO             NO2              NOx             NH3       
##  Min.   : 1.01   Min.   : 17.46   Min.   :10.37   Min.   : 6.79  
##  1st Qu.: 3.42   1st Qu.: 32.93   1st Qu.:22.20   1st Qu.:19.58  
##  Median : 7.12   Median : 41.93   Median :28.68   Median :24.64  
##  Mean   :11.75   Mean   : 44.17   Mean   :32.92   Mean   :26.77  
##  3rd Qu.:15.42   3rd Qu.: 51.61   3rd Qu.:39.95   3rd Qu.:30.82  
##  Max.   :67.89   Max.   :112.54   Max.   :94.31   Max.   :72.95  
##        CO              SO2               O3           Benzene      
##  Min.   :0.2800   Min.   : 2.100   Min.   : 1.19   Min.   : 0.020  
##  1st Qu.:0.6800   1st Qu.: 5.065   1st Qu.:11.60   1st Qu.: 0.580  
##  Median :0.8100   Median : 9.930   Median :18.62   Median : 1.210  
##  Mean   :0.8846   Mean   :10.781   Mean   :19.66   Mean   : 1.667  
##  3rd Qu.:0.9900   3rd Qu.:12.450   3rd Qu.:24.61   3rd Qu.: 2.355  
##  Max.   :2.8000   Max.   :32.840   Max.   :74.89   Max.   :10.660  
##     Toluene          Xylene           AQI         AQI_Bucket       
##  Min.   : 0.45   Min.   :0.000   Min.   : 40.0   Length:267        
##  1st Qu.: 5.40   1st Qu.:0.180   1st Qu.: 86.5   Class :character  
##  Median : 8.95   Median :0.600   Median :119.0   Mode  :character  
##  Mean   :10.77   Mean   :1.113   Mean   :156.3                     
##  3rd Qu.:14.84   3rd Qu.:1.600   3rd Qu.:202.0                     
##  Max.   :63.70   Max.   :9.880   Max.   :692.0                     
##   Daily_Delay         Month       
##  Min.   :  8.00   Min.   : 1.000  
##  1st Qu.: 42.00   1st Qu.: 3.000  
##  Median : 56.00   Median : 7.000  
##  Mean   : 73.72   Mean   : 6.341  
##  3rd Qu.: 81.00   3rd Qu.: 8.000  
##  Max.   :420.00   Max.   :12.000

## Analyse the cohesive dataset to understand how delay and the other parameters plot against each other ----

# Column inventory of the cohesive dataset.
names(Delhi_cohesive_dataset)
##  [1] "Date"        "tavg"        "tmin"        "tmax"        "prcp"       
##  [6] "StationId"   "PM2.5"       "PM10"        "NO"          "NO2"        
## [11] "NOx"         "NH3"         "CO"          "SO2"         "O3"         
## [16] "Benzene"     "Toluene"     "Xylene"      "AQI"         "AQI_Bucket" 
## [21] "Daily_Delay" "Month"
# Delay vs AQI, coloured by AQI bucket, point size = precipitation.
ggplot(Delhi_cohesive_dataset, aes(x = AQI, y = Daily_Delay, color = AQI_Bucket, size = prcp)) +
  geom_point() +
  labs(title = "Impact of AQI and prcp")

## As per the plot, Good AQI too gets observed for some delay cases but they are far and few... and does not seem to have caused high amounts of delay
## There are a huge amount of delays for satisfactory AQI cases, but most of the delays could be associated
## with pretty high precipitation
## There are a good amount of delays associated with moderate cases too, and they have caused significant delays when combined with high precipitation
## Delay instances reduce for Poor AQI cases but there is a slight increase in the values of delays
## For very poor cases, impact gets high when combined with precipitation
## Severe cases are high impact ones but look unaffected by precipitation


## Now lets view this purely from the weather perspective
names(Delhi_cohesive_dataset)
##  [1] "Date"        "tavg"        "tmin"        "tmax"        "prcp"       
##  [6] "StationId"   "PM2.5"       "PM10"        "NO"          "NO2"        
## [11] "NOx"         "NH3"         "CO"          "SO2"         "O3"         
## [16] "Benzene"     "Toluene"     "Xylene"      "AQI"         "AQI_Bucket" 
## [21] "Daily_Delay" "Month"
# Delay vs average temperature, coloured by tmin, sized by precipitation.
ggplot(Delhi_cohesive_dataset, aes(x = tavg, y = Daily_Delay, color = tmin, size = prcp)) +
  geom_point() +
  labs(title = "Impact of temp and prcp")

## It's clear that bigger precipitation brings in more instances of delays
## But it's also interesting to find that higher tavg, higher precipitation and higher tmin bring
# in a lot of delays - though the size of precipitation does not always result in costly delays

## Ok lets also analyse whether the components O3, PM2.5 and CO have impacts on delays

ggplot(Delhi_cohesive_dataset, aes(x = O3, y = Daily_Delay, size = O3)) +
  geom_point() +
  labs(title = "Impact of O3")

## Looks like more O3 directly relates to higher delays

ggplot(Delhi_cohesive_dataset, aes(x = PM2.5, y = Daily_Delay, size = PM2.5)) +
  geom_point() +
  labs(title = "Impact of PM2.5")

## Looks like more PM2.5 might not have too much impact...

ggplot(Delhi_cohesive_dataset, aes(x = CO, y = Daily_Delay, size = CO)) +
  geom_point() +
  labs(title = "Impact of CO")

## Looks like the level of CO has some correlation but may not be linear...

ggplot(Delhi_cohesive_dataset, aes(x = PM10, y = Daily_Delay, size = PM10)) +
  geom_point() +
  labs(title = "Impact of PM10")

## Looks like more PM10 might not have too much impact either
## (this comment previously said PM2.5, a copy-paste slip)

ggplot(Delhi_cohesive_dataset, aes(x = prcp, y = Daily_Delay, size = prcp)) +
  geom_point() +
  labs(title = "Impact of rain")

## Looks like the amount of rain has a direct impact on delays...


ggplot(Delhi_cohesive_dataset, aes(x = tavg, y = Daily_Delay, size = tavg)) +
  geom_point() +
  labs(title = "Impact of Average Temp")

## Looks like a lot of low intensity delays at higher average temperature...

ggplot(Delhi_cohesive_dataset, aes(x = tmin, y = Daily_Delay, size = tmin)) +
  geom_point() +
  labs(title = "Impact of Tmin")

## Looks like a lot of low intensity delays at higher tmin as well
## (this comment previously repeated the tavg text, a copy-paste slip)

ggplot(Delhi_cohesive_dataset, aes(x = AQI, y = Daily_Delay, size = AQI, color=AQI_Bucket)) +
  geom_point() +
  labs(title = "Impact of AQI")

## NOTE(review): the original comment here referred to Tmin (copy-paste);
## this plot shows Daily_Delay against AQI, sized by AQI, coloured by bucket.

## Lets see if the months themselves have any impact on the delay

ggplot(Delhi_cohesive_dataset, aes(x = Month, y = Daily_Delay, size = Daily_Delay)) +
  geom_point()+ scale_x_continuous(breaks=seq(1, 12, by = 1))+
  labs(title = "Impact of Month")

## Looks like there is a high frequency of delays during monsoon and heavy delays during the peak winter season


## Ok based on this, lets pick these elements to find the right model on what impacts the delays of Delhi airtraffic:
## Precipitation, AQI, tmin, O3 and CO

## Lets see how the elements individually have a linear regression relationship with the traffic delay


## Ok lets build the base model here
# Baseline: delay explained by AQI alone, then AQI plus one extra regressor
# at a time, so the incremental contribution of each can be compared below.
Delhi_Traffic_Delay_Model_AQI = lm(Daily_Delay ~ AQI, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_AQI)

Delhi_Traffic_Delay_Model_tavg = lm(Daily_Delay ~ AQI+tavg, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_tavg)

Delhi_Traffic_Delay_Model_prcp = lm(Daily_Delay ~ AQI+prcp, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_prcp)

Delhi_Traffic_Delay_Model_O3 = lm(Daily_Delay ~ AQI+O3, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_O3)

Delhi_Traffic_Delay_Model_CO = lm(Daily_Delay ~ AQI+CO, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_CO)

Delhi_Traffic_Delay_Model_Month = lm(Daily_Delay ~ AQI+Month, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_Month)

# evaluate_model() predicts Daily_Delay at representative inputs
# (AQI = 0/200/400), holding any named covariate at the supplied value.
evaluate_model(Delhi_Traffic_Delay_Model_AQI)
##   AQI model_output
## 1   0     65.89103
## 2 200     75.90179
## 3 400     85.91255
evaluate_model(Delhi_Traffic_Delay_Model_tavg, tavg = 35)
##   AQI tavg model_output
## 1   0   35     65.28536
## 2 200   35     74.13083
## 3 400   35     82.97629
evaluate_model(Delhi_Traffic_Delay_Model_prcp, prcp = 150)
##   AQI prcp model_output
## 1   0  150     207.8647
## 2 200  150     221.3815
## 3 400  150     234.8983
evaluate_model(Delhi_Traffic_Delay_Model_O3, O3 = 50)
##   AQI O3 model_output
## 1   0 50     75.87323
## 2 200 50     86.52229
## 3 400 50     97.17135
evaluate_model(Delhi_Traffic_Delay_Model_CO, CO = 1)
##   AQI CO model_output
## 1   0  1     66.60121
## 2 200  1     76.02051
## 3 400  1     85.43981
evaluate_model(Delhi_Traffic_Delay_Model_Month, Month = 12)
##   AQI Month model_output
## 1   0    12     84.65495
## 2 200    12    100.87275
## 3 400    12    117.09055
# Hand-copied deltas: predicted delay at AQI 200 minus at AQI 0 for each
# candidate model (values taken from the evaluate_model() outputs above).
diff_1 <- 75.90179 - 65.89103
diff_1
## [1] 10.01076
diff_2 <- 74.13083 - 65.28536
diff_2
## [1] 8.84547
diff_3 <- 221.3815 - 207.8647
diff_3
## [1] 13.5168
diff_4 <- 86.52229 - 75.87323
diff_4
## [1] 10.64906
diff_5 <- 76.02051 - 66.60121
diff_5
## [1] 9.4193
diff_6 <- 100.87275 - 84.65495
diff_6
## [1] 16.2178
# Comparing the model evaluation based on the above, we can see that prcp, Month and O3 have a good impact
# on the delay

## To evaluate the base model, split the data into test and train datasets

# make this split reproducible
set.seed(1)

# Use 70% of dataset as training set and remaining 30% as testing set
sample_set <- sample(c(TRUE, FALSE), nrow(Delhi_cohesive_dataset), replace=TRUE, prob=c(0.7,0.3))
train_dataset  <- Delhi_cohesive_dataset[sample_set, ]
test_dataset   <- Delhi_cohesive_dataset[!sample_set, ]

# the base model with AQI and prcp
# (the earlier comment said "AQI and tavg" - a copy-paste slip)
Base_Model_Delay = lm(Daily_Delay ~ AQI+prcp, data = train_dataset)
# the augmented model additionally includes Month
Aug_Model_Delay = lm(Daily_Delay ~ AQI+prcp+Month, data = train_dataset)
# Run cross validation trials on the two models
trials <- cv_pred_error(Base_Model_Delay, Aug_Model_Delay)


# Compare the two sets of cross-validated errors
t.test(mse ~ model, data = trials)
## 
##  Welch Two Sample t-test
## 
## data:  mse by model
## t = -2.8426, df = 5.7322, p-value = 0.03099
## alternative hypothesis: true difference in means between group Aug_Model_Delay and group Base_Model_Delay is not equal to 0
## 95 percent confidence interval:
##  -185.63143  -12.83777
## sample estimates:
##  mean in group Aug_Model_Delay mean in group Base_Model_Delay 
##                       3091.307                       3190.542
# t-statistic is -2.8426. degrees of freedom, df is 5.7322 are the degrees of freedom. These are used with a t-distribution to derive p-value of 0.03099

# p-value = 0.03099 - i.e., Given that there is no actual/true difference in means, if we repeat the experiment over and over again, 3.1% of the time we would see the type of difference in means as in your samples, or a more extreme difference in means. Since p value is significantly lower than 0.05, the differences are significant.
# So we can reject the null hypothesis (H0) of no difference between the (true) averages of the two groups
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -185.63143 1-12.83777
# If we reject H0, the 95% confidence interval for the true difference in mean MSE (augmented minus base) is [-185.63143, -12.83777]; the sample group means themselves are 3091.307 (augmented) and 3190.542 (base).
# So we will choose the augmented model - i.e., Daily_Delay ~ AQI+prcp+Month

## Model for predicting Delhi air traffic delays

## For our model to predict the air traffic delays:
## Response Variable is Daily_Delay
## Explanatory Variables are Precipitation (prcp), AQI and Month

## A linear regression model is the right fit here: the target is a
## continuous numeric quantity, so this is a regression problem rather
## than a classification one.
Delhi_Traffic_Delay_Model <- lm(Daily_Delay ~ AQI + prcp + Month, data = train_dataset)
summary(Delhi_Traffic_Delay_Model)
## 
## Call:
## lm(formula = Daily_Delay ~ AQI + prcp + Month, data = train_dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.585 -30.554 -12.036   6.998 284.359 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 29.63157   11.80968   2.509  0.01296 * 
## AQI          0.08275    0.03793   2.182  0.03038 * 
## prcp         0.93401    0.41760   2.237  0.02651 * 
## Month        3.92586    1.23860   3.170  0.00179 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.59 on 185 degrees of freedom
## Multiple R-squared:  0.08357,    Adjusted R-squared:  0.06871 
## F-statistic: 5.623 on 3 and 185 DF,  p-value: 0.001038
# Generate delay predictions for the held-out test rows
Predicted_Traffic_Delay <- predict(Delhi_Traffic_Delay_Model, newdata = test_dataset)
print(Predicted_Traffic_Delay)
##         4         6         7        15        17        18        20        21 
##  70.04663  70.48225  66.13254  86.71768  80.13123 128.87608  69.57199 105.29059 
##        29        35        37        39        41        43        46        49 
##  69.53240  93.94147  98.30356 110.55783  98.28110  95.03004  68.54983  61.05614 
##        52        61        68        70        72        76        77        79 
##  57.72089  65.94984  48.48926  66.38734  71.30725  59.62546  57.14291  62.60452 
##        80        82        85        87        94        95        96        99 
##  48.07795  50.80630  48.92676  57.18521  55.47693  54.07015  54.31841  49.84982 
##       104       109       111       112       117       121       125       135 
##  75.11322  54.87521  55.84448  70.11429  65.36219  66.46084  72.88393  66.38080 
##       139       145       148       150       162       164       165       169 
##  74.59925  76.01754 112.17120  71.53428  66.28567  63.65000  64.39476  85.73501 
##       172       173       176       178       180       183       185       187 
##  65.75533  69.72740  67.32761  69.67660  70.51966  79.32889  64.51405  72.55232 
##       188       189       191       194       198       200       210       211 
##  71.22830 108.22343  70.51966  66.08633  70.72042  69.75935  78.49948  72.44394 
##       213       214       215       218       219       225       230       243 
##  72.24649  69.59844  72.19569  72.49474 102.32417  68.63737  76.00685  94.49699 
##       250       251       252       260       264       265 
## 134.81339  90.11819  89.78719  53.16956  62.75636  61.51509
# Attach the predictions as a new column so actual and predicted delays
# sit side by side in the test set
test_dataset$Predicted_Delay <- Predicted_Traffic_Delay

# Collect actual vs predicted delays per (year, month) for plotting.
# The original multi-row-per-group summarise() has been deprecated since
# dplyr 1.1.0 (it emitted a lifecycle warning here); reframe() is the
# documented replacement and returns the same rows as an ungrouped tibble.
Summary_Model_Performace <- test_dataset %>%
  group_by(YEAR = year(ymd(Date)), Month) %>%
  reframe(Daily_Delay, Predicted_Delay)
Summary_Model_Performace
## # A tibble: 78 × 4
##     YEAR Month Daily_Delay Predicted_Delay
##    <dbl> <dbl>       <dbl>           <dbl>
##  1  2018     6          30            70.0
##  2  2018     7          31            70.5
##  3  2018     7           8            66.1
##  4  2018     7          59            86.7
##  5  2018     7          45            80.1
##  6  2018     7          62           129. 
##  7  2018     7         100            69.6
##  8  2018     7          61           105. 
##  9  2018     8         105            69.5
## 10  2018    11          60            93.9
## # ℹ 68 more rows
# Plot actual vs predicted daily delays by month, faceted by year, so the
# model's fit can be eyeballed season by season.
ggplot(Summary_Model_Performace, aes(x = Month)) +
  geom_point(aes(y = Daily_Delay, color = "Daily_Delay")) +
  # Legend label typo fixed: "Predictede_Delay" -> "Predicted_Delay"
  geom_point(aes(y = Predicted_Delay, color = "Predicted_Delay")) +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(title = "Model Performance", color = "Series") +
  facet_wrap(~YEAR)

# As we can see, the model tracks some months reasonably well but misses the
# extreme delay spikes. So the model needs further fine tuning, or the
# dataset needs to be reanalyzed.